diff --git a/.licenserc.yaml b/.licenserc.yaml index 6e2b465590..804e6ef5ec 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -21,17 +21,28 @@ header: copyright-owner: Apache Software Foundation paths-ignore: - - '**/*.csv' - - '**/*.txt' - - '**/*.md' - - '**/*.pem' - - '**/*.sha256' - - '**/Cargo.lock' - - '**/target/**' - - '.gitattributes' - - '.github/ISSUE_TEMPLATE/**' - - '.gitmodules' - - 'DISCLAIMER' + # License and notice files + - 'licenses' - 'LICENSE' - 'NOTICE' - - 'licenses' + - 'DISCLAIMER' + + # Documentation and configuration files + - '**/*.md' + - '**/Cargo.lock' + - 'rust-toolchain' + - '**/*.lds' + + # Cryptographic and binary files for testing + - '**/*.pem' + - '**/*.key' + - '**/*.crt' + - '**/*.rsa' + - '**/*.der' + + # Third-party libraries - included in licenses directory + - 'ring-0.17.14' # LICENSE-ring.txt + +dependency: + files: + - 'Cargo.toml' \ No newline at end of file diff --git a/LICENSE b/LICENSE index 46ae475fa3..570d0c9ac8 100644 --- a/LICENSE +++ b/LICENSE @@ -206,3 +206,8 @@ This product bundles various third-party components under other open source licenses. This section summarizes those components and their licenses. See licenses/ for text of these licenses. + +Third-party components included in this distribution: + +- ring-0.17.14/ + License: See licenses/LICENSE-ring.txt \ No newline at end of file diff --git a/licenses/LICENSE-BoringSSL b/licenses/LICENSE-BoringSSL new file mode 100644 index 0000000000..8e6eb34e5b --- /dev/null +++ b/licenses/LICENSE-BoringSSL @@ -0,0 +1,272 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +Licenses for support code +------------------------- + +Parts of the TLS test suite are under the Go license. This code is not included +in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +distributing code linked against BoringSSL does not trigger this license: + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. 
nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +BoringSSL uses the Chromium test infrastructure to run a continuous build, +trybots etc. The scripts which manage this, and the script for generating build +metadata, are under the Chromium license. Distributing code linked against +BoringSSL does not trigger this license. + +Copyright 2015 The Chromium Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/LICENSE-other-bits b/licenses/LICENSE-other-bits new file mode 100644 index 0000000000..20cfadb6ee --- /dev/null +++ b/licenses/LICENSE-other-bits @@ -0,0 +1,13 @@ +Copyright 2015-2025 Brian Smith. + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-ring.txt b/licenses/LICENSE-ring.txt new file mode 100644 index 0000000000..963c394c44 --- /dev/null +++ b/licenses/LICENSE-ring.txt @@ -0,0 +1,9 @@ +*ring* uses an "ISC" license, like BoringSSL used to use, for new code +files. See LICENSE-other-bits for the text of that license. + +See LICENSE-BoringSSL for code that was sourced from BoringSSL under the +Apache 2.0 license. Some code that was sourced from BoringSSL under the ISC +license. In each case, the license info is at the top of the file. + +See src/polyfill/once_cell/LICENSE-APACHE and src/polyfill/once_cell/LICENSE-MIT +for the license to code that was sourced from the once_cell project. \ No newline at end of file diff --git a/ring-0.17.14/.cargo_vcs_info.json b/ring-0.17.14/.cargo_vcs_info.json new file mode 100644 index 0000000000..9361bf9c60 --- /dev/null +++ b/ring-0.17.14/.cargo_vcs_info.json @@ -0,0 +1,7 @@ +{ + "git": { + "sha1": "2723abbca9e83347d82b056d5b239c6604f786df", + "dirty": true + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/ring-0.17.14/Cargo.toml b/ring-0.17.14/Cargo.toml new file mode 100644 index 0000000000..273383ece7 --- /dev/null +++ b/ring-0.17.14/Cargo.toml @@ -0,0 +1,231 @@ +[package] +build = "build.rs" +categories = ["cryptography", "no-std"] +description = "An experiment." +edition = "2021" +keywords = ["crypto", "cryptography", "rand", "ECC", "RSA"] +license = "Apache-2.0 AND ISC" +name = "ring" +repository = "https://github.com/briansmith/ring" + +# Keep in sync with .github/workflows/ci.yml ("MSRV") and see the MSRV note +# in cpu/arm.rs. +# 1.66 is required on x86/x86_64 for https://github.com/rust-lang/rust/pull/101861. +rust-version = "1.66.0" + +# Keep in sync with `links` below. +version = "0.17.14" + +# Keep in sync with `version` above. +# +# build.rs verifies that this equals "ring_core_{major}_{minor}_{patch}_{pre}" +# as keeping this in sync with the symbol prefixing is crucial for ensuring +# the safety of multiple versions of *ring* being used in a program. 
+links = "ring_core_0_17_14_" + +include = [ + "LICENSE", + "LICENSE-other-bits", + "LICENSE-BoringSSL", + "src/polyfill/once_cell/LICENSE-APACHE", + "src/polyfill/once_cell/LICENSE-MIT", + + "Cargo.toml", + + "pregenerated/*", + + "benches/*.rs", + "build.rs", + + "crypto/chacha/asm/chacha-armv4.pl", + "crypto/chacha/asm/chacha-armv8.pl", + "crypto/chacha/asm/chacha-x86.pl", + "crypto/chacha/asm/chacha-x86_64.pl", + "crypto/constant_time_test.c", + "crypto/cpu_intel.c", + "crypto/crypto.c", + "crypto/curve25519/asm/x25519-asm-arm.S", + "crypto/curve25519/curve25519.c", + "crypto/curve25519/curve25519_64_adx.c", + "crypto/curve25519/curve25519_tables.h", + "crypto/curve25519/internal.h", + "crypto/fipsmodule/aes/aes_nohw.c", + "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl", + "crypto/fipsmodule/aes/asm/aesni-x86.pl", + "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl", + "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", + "crypto/fipsmodule/aes/asm/aesv8-armx.pl", + "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl", + "crypto/fipsmodule/aes/asm/ghash-armv4.pl", + "crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl", + "crypto/fipsmodule/aes/asm/ghash-x86.pl", + "crypto/fipsmodule/aes/asm/ghash-x86_64.pl", + "crypto/fipsmodule/aes/asm/ghashv8-armx.pl", + "crypto/fipsmodule/aes/asm/bsaes-armv7.pl", + "crypto/fipsmodule/aes/asm/bsaes-x86_64.pl", + "crypto/fipsmodule/aes/asm/vsaes-armv7.pl", + "crypto/fipsmodule/aes/asm/vpaes-armv7.pl", + "crypto/fipsmodule/aes/asm/vpaes-armv8.pl", + "crypto/fipsmodule/aes/asm/vpaes-x86.pl", + "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl", + "crypto/fipsmodule/bn/asm/armv4-mont.pl", + "crypto/fipsmodule/bn/asm/armv8-mont.pl", + "crypto/fipsmodule/bn/asm/x86-mont.pl", + "crypto/fipsmodule/bn/asm/x86_64-mont.pl", + "crypto/fipsmodule/bn/asm/x86_64-mont5.pl", + "crypto/fipsmodule/bn/internal.h", + "crypto/fipsmodule/bn/montgomery.c", + "crypto/fipsmodule/bn/montgomery_inv.c", + "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl", + "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl", + "crypto/fipsmodule/ec/ecp_nistz.c", + "crypto/fipsmodule/ec/ecp_nistz.h", + "crypto/fipsmodule/ec/ecp_nistz384.h", + "crypto/fipsmodule/ec/ecp_nistz384.inl", + "crypto/fipsmodule/ec/gfp_p256.c", + "crypto/fipsmodule/ec/gfp_p384.c", + "crypto/fipsmodule/ec/p256.c", + "crypto/fipsmodule/ec/p256-nistz-table.h", + "crypto/fipsmodule/ec/p256-nistz.c", + "crypto/fipsmodule/ec/p256-nistz.h", + "crypto/fipsmodule/ec/p256_shared.h", + "crypto/fipsmodule/ec/p256_table.h", + "crypto/fipsmodule/ec/util.h", + "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt", + "crypto/fipsmodule/sha/asm/sha256-armv4.pl", + "crypto/fipsmodule/sha/asm/sha512-armv4.pl", + "crypto/fipsmodule/sha/asm/sha512-armv8.pl", + "crypto/fipsmodule/sha/asm/sha512-x86_64.pl", + "crypto/internal.h", + "crypto/limbs/limbs.c", + "crypto/limbs/limbs.h", + "crypto/limbs/limbs.inl", + "crypto/mem.c", + "crypto/perlasm/arm-xlate.pl", + "crypto/perlasm/x86asm.pl", + "crypto/perlasm/x86gas.pl", + "crypto/perlasm/x86nasm.pl", + "crypto/perlasm/x86_64-xlate.pl", + "crypto/poly1305/poly1305.c", + "crypto/poly1305/poly1305_arm.c", + "crypto/poly1305/poly1305_arm_asm.S", + "crypto/cipher/asm/chacha20_poly1305_armv8.pl", + "crypto/cipher/asm/chacha20_poly1305_x86_64.pl", + "examples/**/*.rs", + "include/ring-core/aes.h", + "include/ring-core/asm_base.h", + "include/ring-core/base.h", + "include/ring-core/check.h", + "include/ring-core/mem.h", + "include/ring-core/target.h", + "include/ring-core/type_check.h", + "src/**/*.rs", + "src/aead/poly1305_test.txt", + 
"src/data/alg-rsa-encryption.der", + "src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der", + "src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der", + "src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der", + "src/rsa/signature_rsa_example_private_key.der", + "src/rsa/signature_rsa_example_public_key.der", + "tests/**/*.rs", + "tests/ecdsa_test_private_key_p256.p8", + "tests/ecdsa_test_public_key_p256.der", + "tests/ecdsa_test_public_key_p256_debug.txt", + "tests/ed25519_test_private_key.bin", + "tests/ed25519_test_private_key.p8", + "tests/ed25519_test_public_key.bin", + "tests/ed25519_test_public_key.der", + "tests/rsa_test_private_key_2048.p8", + "tests/rsa_test_public_key_2048.der", + "tests/rsa_test_public_key_2048_debug.txt", + "tests/rsa_test_public_modulus.bin", + "third_party/fiat/asm/fiat_curve25519_adx_mul.S", + "third_party/fiat/asm/fiat_curve25519_adx_square.S", + "third_party/fiat/curve25519_32.h", + "third_party/fiat/curve25519_64.h", + "third_party/fiat/curve25519_64_adx.h", + "third_party/fiat/curve25519_64_msvc.h", + "third_party/fiat/p256_32.h", + "third_party/fiat/p256_64.h", + "third_party/fiat/p256_64_msvc.h", + "third_party/fiat/LICENSE", +] + +[package.metadata.docs.rs] +all-features = true + +[lib] +name = "ring" + +[dependencies] +cfg-if = { version = "1.0.0", default-features = false } +untrusted = { version = "0.9" } + +[target.'cfg(not(target_os = "optee"))'.dependencies] +getrandom = { version = "0.2.10" } + +[target.'cfg(all(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little")), any(target_os = "android", target_os = "linux")))'.dependencies] +libc = { version = "0.2.148", default-features = false } + +[target.'cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_vendor = "apple", any(target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos")))'.dependencies] +libc = { version = "0.2.155", default-features = false } + +[target.'cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "windows"))'.dependencies] +windows-sys = { version = "0.52", features = ["Win32_Foundation", "Win32_System_Threading"] } + +[target.'cfg(target_os="optee")'.dependencies] +optee-utee = { version = "0.4.0" } + +[target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dev-dependencies] +wasm-bindgen-test = { version = "0.3.37", default-features = false, features = ["std"] } + +[target.'cfg(any(unix, windows, target_os = "wasi"))'.dev-dependencies] +libc = { version = "0.2.148", default-features = false } + +[build-dependencies] +cc = { version = "1.2.8", default-features = false } + +[features] +# These features are documented in the top-level module's documentation. 
+default = ["alloc", "dev_urandom_fallback"] +alloc = [] +dev_urandom_fallback = [] +less-safe-getrandom-custom-or-rdrand = [] +less-safe-getrandom-espidf = [] +slow_tests = [] +std = ["alloc"] +unstable-testing-arm-no-hw = [] +unstable-testing-arm-no-neon = [] +test_logging = [] +wasm32_unknown_unknown_js = ["getrandom/js"] + +# XXX: debug = false because of https://github.com/rust-lang/rust/issues/34122 + +[profile.bench] +opt-level = 3 +debug = false +rpath = false +lto = true +debug-assertions = false +codegen-units = 1 + +[profile.release] +opt-level = 3 +debug = false +rpath = false +lto = true +debug-assertions = false +codegen-units = 1 + +[workspace] +members = [ + # intentionally not a default member so that `cargo test` doesn't cause criterion.rs and all its + # dependencies to get built. + "bench", + + "cavp", +] +default-members = [ + ".", + "cavp" +] diff --git a/ring-0.17.14/LICENSE b/ring-0.17.14/LICENSE new file mode 100644 index 0000000000..2dac4d9d3f --- /dev/null +++ b/ring-0.17.14/LICENSE @@ -0,0 +1,9 @@ +*ring* uses an "ISC" license, like BoringSSL used to use, for new code +files. See LICENSE-other-bits for the text of that license. + +See LICENSE-BoringSSL for code that was sourced from BoringSSL under the +Apache 2.0 license. Some code that was sourced from BoringSSL under the ISC +license. In each case, the license info is at the top of the file. + +See src/polyfill/once_cell/LICENSE-APACHE and src/polyfill/once_cell/LICENSE-MIT +for the license to code that was sourced from the once_cell project. diff --git a/ring-0.17.14/LICENSE-BoringSSL b/ring-0.17.14/LICENSE-BoringSSL new file mode 100644 index 0000000000..a0f82a1f8e --- /dev/null +++ b/ring-0.17.14/LICENSE-BoringSSL @@ -0,0 +1,272 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +Licenses for support code +------------------------- + +Parts of the TLS test suite are under the Go license. This code is not included +in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +distributing code linked against BoringSSL does not trigger this license: + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. 
nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +BoringSSL uses the Chromium test infrastructure to run a continuous build, +trybots etc. The scripts which manage this, and the script for generating build +metadata, are under the Chromium license. Distributing code linked against +BoringSSL does not trigger this license. + +Copyright 2015 The Chromium Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/ring-0.17.14/LICENSE-other-bits b/ring-0.17.14/LICENSE-other-bits new file mode 100644 index 0000000000..bccf030aec --- /dev/null +++ b/ring-0.17.14/LICENSE-other-bits @@ -0,0 +1,13 @@ +Copyright 2015-2025 Brian Smith. + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/ring-0.17.14/README.md b/ring-0.17.14/README.md new file mode 100644 index 0000000000..7ae6f6157e --- /dev/null +++ b/ring-0.17.14/README.md @@ -0,0 +1,54 @@ +THE SOFTWARE IS PROVIDED "AS IS" AND BRIAN SMITH AND THE AUTHORS DISCLAIM +ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL BRIAN SMITH OR THE AUTHORS +BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY +DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + +Most of the C and assembly language code in *ring* comes from BoringSSL. +BoringSSL is a fork of OpenSSL. This quote from the BoringSSL README.md +discouraging you from using it applies to this project: + +> BoringSSL is a fork of OpenSSL that is designed to meet Google's needs. +> +> Although BoringSSL is an open source project, it is not intended for general +> use, as OpenSSL is. We don't recommend that third parties depend upon it. + +This project was originally shared on GitHub in 2015 as an experiment. It was +put on crates.io shortly to help other people with their experiments. It is an +experiment. + + +Side Channels +------------- + +See [SIDE-CHANNELS.md](SIDE-CHANNELS.md) for important information regarding +the limitations of the side channel mitigations in this project. + + +Toolchains & Targets +-------------------- + +Be especially weary about using toolchains (C compilers, etc.) or targets +that aren't supported by other projects, especially BoringSSL. The further you +are from using the same version of Clang that Chrome uses, the more weary you +should be. + + +Bug Reporting +------------- + +For security vulnerabilities, see https://github.com/briansmith/ring/security/policy. + +Please report bugs that aren't security vulnerabilities either as pull requests or as issues in +[the issue tracker](https://github.com/briansmith/ring/issues). + + + +Release Notes +------------- +It is recommended that you review every commit in this project. Some +particularly noteworthy changes are noted in the [RELEASES.md](RELEASES.md). We could use some +help in making this better. diff --git a/ring-0.17.14/build.rs b/ring-0.17.14/build.rs new file mode 100644 index 0000000000..2441037315 --- /dev/null +++ b/ring-0.17.14/build.rs @@ -0,0 +1,1045 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Build the non-Rust components. + +// It seems like it would be a good idea to use `log!` for logging, but it +// isn't worth having the external dependencies (one for the `log` crate, and +// another for the concrete logging implementation). Instead we use `eprintln!` +// to log everything to stderr. + +use std::{ + ffi::{OsStr, OsString}, + fs::{self, DirEntry}, + io::Write, + path::{Path, PathBuf}, + process::{Command, Stdio}, +}; + +mod env { + use std::ffi::OsString; + + /// Read an environment variable and tell Cargo that we depend on it. + /// + /// The name is static since we intend to only read a static set of environment + /// variables. + pub fn var_os(name: &'static str) -> Option { + println!("cargo:rerun-if-env-changed={}", name); + std::env::var_os(name) + } + + pub fn var(name: &'static str) -> Option { + var_os(name).and_then(|value| value.into_string().ok()) + } +} + +const X86: &str = "x86"; +const X86_64: &str = "x86_64"; +const AARCH64: &str = "aarch64"; +const ARM: &str = "arm"; +const WASM32: &str = "wasm32"; + +#[rustfmt::skip] +const RING_SRCS: &[(&[&str], &str)] = &[ + (&[], "crypto/curve25519/curve25519.c"), + (&[], "crypto/fipsmodule/aes/aes_nohw.c"), + (&[], "crypto/fipsmodule/bn/montgomery.c"), + (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), + (&[], "crypto/fipsmodule/ec/ecp_nistz.c"), + (&[], "crypto/fipsmodule/ec/gfp_p256.c"), + (&[], "crypto/fipsmodule/ec/gfp_p384.c"), + (&[], "crypto/fipsmodule/ec/p256.c"), + (&[], "crypto/limbs/limbs.c"), + (&[], "crypto/mem.c"), + (&[], "crypto/poly1305/poly1305.c"), + + (&[ARM, X86_64, X86], "crypto/crypto.c"), + + (&[X86_64, X86], "crypto/cpu_intel.c"), + + (&[X86], "crypto/fipsmodule/aes/asm/aesni-x86.pl"), + (&[X86], "crypto/fipsmodule/aes/asm/ghash-x86.pl"), + (&[X86], "crypto/fipsmodule/aes/asm/vpaes-x86.pl"), + (&[X86], "crypto/fipsmodule/bn/asm/x86-mont.pl"), + (&[X86], "crypto/chacha/asm/chacha-x86.pl"), + + (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"), + (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"), + (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl"), + (&[X86_64], "crypto/fipsmodule/bn/asm/x86_64-mont.pl"), + (&[X86_64], "crypto/fipsmodule/bn/asm/x86_64-mont5.pl"), + (&[X86_64], "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl"), + (&[X86_64], SHA512_X86_64), + (&[X86_64], "crypto/cipher/asm/chacha20_poly1305_x86_64.pl"), + (&[X86_64], "third_party/fiat/asm/fiat_curve25519_adx_mul.S"), + (&[X86_64], "third_party/fiat/asm/fiat_curve25519_adx_square.S"), + + (&[AARCH64, X86_64], "crypto/fipsmodule/ec/p256-nistz.c"), + + (&[ARM], "crypto/fipsmodule/aes/asm/bsaes-armv7.pl"), + (&[ARM], "crypto/fipsmodule/aes/asm/ghash-armv4.pl"), + (&[ARM], "crypto/fipsmodule/aes/asm/vpaes-armv7.pl"), + (&[ARM], "crypto/fipsmodule/bn/asm/armv4-mont.pl"), + (&[ARM], "crypto/chacha/asm/chacha-armv4.pl"), + (&[ARM], "crypto/curve25519/asm/x25519-asm-arm.S"), + (&[ARM], "crypto/poly1305/poly1305_arm.c"), + (&[ARM], 
"crypto/poly1305/poly1305_arm_asm.S"), + (&[ARM], "crypto/fipsmodule/sha/asm/sha256-armv4.pl"), + (&[ARM], "crypto/fipsmodule/sha/asm/sha512-armv4.pl"), + + (&[AARCH64], "crypto/chacha/asm/chacha-armv8.pl"), + (&[AARCH64], "crypto/cipher/asm/chacha20_poly1305_armv8.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/aesv8-armx.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/ghashv8-armx.pl"), + (&[AARCH64], "crypto/fipsmodule/aes/asm/vpaes-armv8.pl"), + (&[AARCH64], "crypto/fipsmodule/bn/asm/armv8-mont.pl"), + (&[AARCH64], "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl"), + (&[AARCH64], SHA512_ARMV8), +]; + +const SHA256_X86_64: &str = "crypto/fipsmodule/sha/asm/sha256-x86_64.pl"; +const SHA512_X86_64: &str = "crypto/fipsmodule/sha/asm/sha512-x86_64.pl"; + +const SHA256_ARMV8: &str = "crypto/fipsmodule/sha/asm/sha256-armv8.pl"; +const SHA512_ARMV8: &str = "crypto/fipsmodule/sha/asm/sha512-armv8.pl"; + +const RING_TEST_SRCS: &[&str] = &[("crypto/constant_time_test.c")]; + +const PREGENERATED: &str = "pregenerated"; + +fn cpp_flags(compiler: &cc::Tool) -> &'static [&'static str] { + if !compiler.is_like_msvc() { + static NON_MSVC_FLAGS: &[&str] = &[ + "-fvisibility=hidden", + "-std=c1x", // GCC 4.6 requires "c1x" instead of "c11" + "-Wall", + "-Wbad-function-cast", + "-Wcast-align", + "-Wcast-qual", + "-Wconversion", + "-Wmissing-field-initializers", + "-Wmissing-include-dirs", + "-Wnested-externs", + "-Wredundant-decls", + "-Wshadow", + "-Wsign-compare", + "-Wsign-conversion", + "-Wstrict-prototypes", + "-Wundef", + "-Wuninitialized", + ]; + NON_MSVC_FLAGS + } else { + static MSVC_FLAGS: &[&str] = &[ + "/Gy", // Enable function-level linking. + "/Zc:wchar_t", + "/Zc:forScope", + "/Zc:inline", + // Warnings. + "/Wall", + "/wd4127", // C4127: conditional expression is constant + "/wd4464", // C4464: relative include path contains '..' + "/wd4514", // C4514: : unreferenced inline function has be + "/wd4710", // C4710: function not inlined + "/wd4711", // C4711: function 'function' selected for inline expansion + "/wd4820", // C4820: : bytes padding added after + "/wd5045", /* C5045: Compiler will insert Spectre mitigation for memory load if + * /Qspectre switch specified */ + ]; + MSVC_FLAGS + } +} + +// None means "any OS" or "any target". The first match in sequence order is +// taken. +const ASM_TARGETS: &[AsmTarget] = &[ + AsmTarget { + oss: LINUX_ABI, + arch: AARCH64, + perlasm_format: "linux64", + }, + AsmTarget { + oss: LINUX_ABI, + arch: ARM, + perlasm_format: "linux32", + }, + AsmTarget { + oss: LINUX_ABI, + arch: X86, + perlasm_format: "elf", + }, + AsmTarget { + oss: LINUX_ABI, + arch: X86_64, + perlasm_format: "elf", + }, + AsmTarget { + oss: &["horizon"], + arch: ARM, + perlasm_format: "linux32", + }, + AsmTarget { + oss: APPLE_ABI, + arch: AARCH64, + perlasm_format: "ios64", + }, + AsmTarget { + oss: APPLE_ABI, + arch: X86_64, + perlasm_format: "macosx", + }, + AsmTarget { + oss: &[WINDOWS], + arch: X86, + perlasm_format: WIN32N, + }, + AsmTarget { + oss: &[WINDOWS], + arch: X86_64, + perlasm_format: NASM, + }, + AsmTarget { + oss: &[WINDOWS], + arch: AARCH64, + perlasm_format: "win64", + }, +]; + +struct AsmTarget { + /// Operating systems. + oss: &'static [&'static str], + + /// Architectures. + arch: &'static str, + + /// The PerlAsm format name. 
+ perlasm_format: &'static str, +} + +impl AsmTarget { + fn use_nasm(&self) -> bool { + [WIN32N, NASM].contains(&self.perlasm_format) + } +} + +/// Operating systems that have the same ABI as Linux on every architecture +/// mentioned in `ASM_TARGETS`. +const LINUX_ABI: &[&str] = &[ + "android", + "dragonfly", + "freebsd", + "fuchsia", + "haiku", + "hurd", + "illumos", + "netbsd", + "openbsd", + "linux", + "redox", + "solaris", + "optee", +]; + +const WIN32N: &str = "win32n"; +const NASM: &str = "nasm"; + +/// Operating systems that have the same ABI as macOS on every architecture +/// mentioned in `ASM_TARGETS`. +const APPLE_ABI: &[&str] = &["ios", "macos", "tvos", "visionos", "watchos"]; + +const WINDOWS: &str = "windows"; + +fn main() { + // Avoid assuming the working directory is the same is the $CARGO_MANIFEST_DIR so that toolchains + // which may assume other working directories can still build this code. + let c_root_dir = PathBuf::from( + env::var_os("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR should always be set"), + ); + + // Keep in sync with `core_name_and_version!` in prefixed.rs. + let core_name_and_version = [ + &env::var("CARGO_PKG_NAME").unwrap(), + "core", + &env::var("CARGO_PKG_VERSION_MAJOR").unwrap(), + &env::var("CARGO_PKG_VERSION_MINOR").unwrap(), + &env::var("CARGO_PKG_VERSION_PATCH").unwrap(), + &env::var("CARGO_PKG_VERSION_PRE").unwrap(), // Often empty + ] + .join("_"); + // Ensure `links` in Cargo.toml is consistent with the version. + assert_eq!( + &env::var("CARGO_MANIFEST_LINKS").unwrap(), + &core_name_and_version + ); + + const RING_PREGENERATE_ASM: &str = "RING_PREGENERATE_ASM"; + match env::var_os(RING_PREGENERATE_ASM).as_deref() { + Some(s) if s == "1" => { + pregenerate_asm_main(&c_root_dir, &core_name_and_version); + } + None => ring_build_rs_main(&c_root_dir, &core_name_and_version), + _ => { + panic!("${} has an invalid value", RING_PREGENERATE_ASM); + } + } +} + +fn ring_build_rs_main(c_root_dir: &Path, core_name_and_version: &str) { + let out_dir = env::var_os("OUT_DIR").unwrap(); + let out_dir = PathBuf::from(out_dir); + + let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); + let env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); + let endian = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); + let is_little_endian = endian == "little"; + + let is_git = fs::metadata(c_root_dir.join(".git")).is_ok(); + + // Published builds are always built in release mode. + let is_debug = is_git && env::var("DEBUG").unwrap() != "false"; + + // During local development, force warnings in non-Rust code to be treated + // as errors. Since warnings are highly compiler-dependent and compilers + // don't maintain backward compatibility w.r.t. which warnings they issue, + // don't do this for packaged builds. + let force_warnings_into_errors = is_git; + + let target = Target { + arch, + os, + env, + is_debug, + force_warnings_into_errors, + }; + + let asm_target = if is_little_endian { + ASM_TARGETS.iter().find(|asm_target| { + asm_target.arch == target.arch && asm_target.oss.contains(&target.os.as_ref()) + }) + } else { + None + }; + + // If `.git` exists then assume this is the "local hacking" case where + // we want to make it easy to build *ring* using `cargo build`/`cargo test` + // without a prerequisite `package` step, at the cost of needing additional + // tools like `Perl` and/or `nasm`. 
+ // + // If `.git` doesn't exist then assume that this is a packaged build where + // we want to optimize for minimizing the build tools required: No Perl, + // no nasm, etc. + let generated_dir = if !is_git { + c_root_dir.join(PREGENERATED) + } else { + generate_sources_and_preassemble( + &out_dir, + asm_target.into_iter(), + c_root_dir, + core_name_and_version, + ); + out_dir.clone() + }; + + build_c_code( + asm_target, + &target, + &generated_dir, + c_root_dir, + &out_dir, + core_name_and_version, + ); + emit_rerun_if_changed() +} + +fn pregenerate_asm_main(c_root_dir: &Path, core_name_and_version: &str) { + let pregenerated = c_root_dir.join(PREGENERATED); + fs::create_dir(&pregenerated).unwrap(); + generate_sources_and_preassemble( + &pregenerated, + ASM_TARGETS.iter(), + c_root_dir, + core_name_and_version, + ); +} + +fn generate_sources_and_preassemble<'a>( + out_dir: &Path, + asm_targets: impl Iterator, + c_root_dir: &Path, + core_name_and_version: &str, +) { + generate_prefix_symbols_headers(out_dir, core_name_and_version).unwrap(); + + let perl_exe = get_perl_exe(); + + for asm_target in asm_targets { + let perlasm_src_dsts = perlasm_src_dsts(out_dir, asm_target); + perlasm(&perl_exe, &perlasm_src_dsts, asm_target, c_root_dir); + + if asm_target.use_nasm() { + // Package pregenerated object files in addition to pregenerated + // assembly language source files, so that the user doesn't need + // to install the assembler. + let srcs = asm_srcs(perlasm_src_dsts); + for src in srcs { + nasm(&src, asm_target.arch, out_dir, out_dir, c_root_dir); + } + } + } +} + +struct Target { + arch: String, + os: String, + env: String, + + /// Is this a debug build? This affects whether assertions might be enabled + /// in the C code. For packaged builds, this should always be `false`. + is_debug: bool, + + /// true: Force warnings to be treated as errors. + /// false: Use the default behavior (perhaps determined by `$CFLAGS`, etc.) + force_warnings_into_errors: bool, +} + +fn build_c_code( + asm_target: Option<&AsmTarget>, + target: &Target, + generated_dir: &Path, + c_root_dir: &Path, + out_dir: &Path, + core_name_and_version: &str, +) { + let (asm_srcs, obj_srcs) = if let Some(asm_target) = asm_target { + let perlasm_src_dsts = perlasm_src_dsts(generated_dir, asm_target); + + let asm_srcs = asm_srcs(perlasm_src_dsts); + + if asm_target.use_nasm() { + // Nasm was already used to generate the object files, so use them instead of + // assembling. + let obj_srcs = asm_srcs + .iter() + .map(|src| obj_path(generated_dir, src.as_path())) + .collect::>(); + (vec![], obj_srcs) + } else { + (asm_srcs, vec![]) + } + } else { + (vec![], vec![]) + }; + + let core_srcs = sources_for_arch(&target.arch) + .into_iter() + .filter(|p| !is_perlasm(p)) + .filter(|p| { + if let Some(extension) = p.extension() { + // We don't (and can't) use any .S on Windows since MSVC and NASM can't assemble + // them. + if extension == "S" + && (target.arch == X86_64 || target.arch == X86) + && target.os == WINDOWS + { + return false; + } + } + true + }) + .collect::>(); + + let test_srcs = RING_TEST_SRCS.iter().map(PathBuf::from).collect::>(); + + let libs = [ + ( + core_name_and_version, + &core_srcs[..], + &asm_srcs[..], + &obj_srcs[..], + ), + ( + &(String::from(core_name_and_version) + "_test"), + &test_srcs[..], + &[], + &[], + ), + ]; + + // XXX: Ideally, ring-test would only be built for `cargo test`, but Cargo + // can't do that yet. 
+ libs.iter() + .for_each(|&(lib_name, srcs, asm_srcs, obj_srcs)| { + let srcs = srcs.iter().chain(asm_srcs); + build_library( + target, + c_root_dir, + out_dir, + lib_name, + srcs, + generated_dir, + obj_srcs, + ) + }); + + println!( + "cargo:rustc-link-search=native={}", + out_dir.to_str().expect("Invalid path") + ); +} + +fn new_build(target: &Target, c_root_dir: &Path, include_dir: &Path) -> cc::Build { + let mut b = cc::Build::new(); + configure_cc(&mut b, target, c_root_dir, include_dir); + b +} + +fn build_library<'a>( + target: &Target, + c_root_dir: &Path, + out_dir: &Path, + lib_name: &str, + srcs: impl Iterator, + include_dir: &Path, + preassembled_objs: &[PathBuf], +) { + let mut c = new_build(target, c_root_dir, include_dir); + + // Compile all the (dirty) source files into object files. + srcs.for_each(|src| { + c.file(c_root_dir.join(src)); + }); + + preassembled_objs.iter().for_each(|obj| { + c.object(obj); + }); + + // Rebuild the library if necessary. + let lib_path = PathBuf::from(out_dir).join(format!("lib{}.a", lib_name)); + + // Handled below. + let _ = c.cargo_metadata(false); + + c.compile( + lib_path + .file_name() + .and_then(|f| f.to_str()) + .expect("No filename"), + ); + + // Link the library. This works even when the library doesn't need to be + // rebuilt. + println!("cargo:rustc-link-lib=static={}", lib_name); +} + +fn obj_path(out_dir: &Path, src: &Path) -> PathBuf { + let mut out_path = out_dir.join(src.file_name().unwrap()); + // To eliminate unnecessary conditional logic, use ".o" as the extension, + // even when the compiler (e.g. MSVC) would normally use something else + // (e.g. ".obj"). cc-rs seems to do the same. + assert!(out_path.set_extension("o")); + out_path +} + +fn configure_cc(c: &mut cc::Build, target: &Target, c_root_dir: &Path, include_dir: &Path) { + let compiler = c.get_compiler(); + // FIXME: On Windows AArch64 we currently must use Clang to compile C code + let compiler = if target.os == WINDOWS && target.arch == AARCH64 && !compiler.is_like_clang() { + let _ = c.compiler("clang"); + c.get_compiler() + } else { + compiler + }; + + let _ = c.include(c_root_dir.join("include")); + let _ = c.include(include_dir); + for f in cpp_flags(&compiler) { + let _ = c.flag(f); + } + + if APPLE_ABI.contains(&target.os.as_str()) { + // ``-gfull`` is required for Darwin's |-dead_strip|. + let _ = c.flag("-gfull"); + } else if !compiler.is_like_msvc() { + let _ = c.flag("-g3"); + }; + + if !target.is_debug { + let _ = c.define("NDEBUG", None); + } + + if target.arch == X86 { + let is_msvc_not_clang_cl = compiler.is_like_msvc() && !compiler.is_like_clang_cl(); + if !is_msvc_not_clang_cl { + let _ = c.flag("-msse2"); + } + } + + // Allow cross-compiling without a target sysroot for these targets. + if (target.arch == WASM32) + || (target.os == "linux" && target.env == "musl" && target.arch != X86_64) + { + // TODO: Expand this to non-clang compilers in 0.17.0 if practical. + if compiler.is_like_clang() { + let _ = c.flag("-nostdlibinc"); + let _ = c.define("RING_CORE_NOSTDLIBINC", "1"); + } + } + + if target.force_warnings_into_errors { + c.warnings_into_errors(true); + } +} + +fn nasm(file: &Path, arch: &str, include_dir: &Path, out_dir: &Path, c_root_dir: &Path) { + let out_file = obj_path(out_dir, file); + let oformat = match arch { + x if x == X86_64 => "win64", + x if x == X86 => "win32", + _ => panic!("unsupported arch: {}", arch), + }; + + // Nasm requires that the path end in a path separator. 
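+    // Append `std::path::MAIN_SEPARATOR` below so that the `-i` argument
+    // handed to nasm always ends with a separator, however `include_dir`
+    // was spelled.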
+ let mut include_dir = include_dir.as_os_str().to_os_string(); + include_dir.push(OsString::from(String::from(std::path::MAIN_SEPARATOR))); + + let mut c = Command::new("./target/tools/windows/nasm/nasm"); + let _ = c + .arg("-o") + .arg(out_file.to_str().expect("Invalid path")) + .arg("-f") + .arg(oformat) + .arg("-i") + .arg("include/") + .arg("-i") + .arg(include_dir) + .arg("-Xgnu") + .arg("-gcv8") + .arg(c_root_dir.join(file)); + run_command(c); +} + +fn run_command_with_args(command_name: &Path, args: &[OsString]) { + let mut cmd = Command::new(command_name); + let _ = cmd.args(args); + run_command(cmd) +} + +fn run_command(mut cmd: Command) { + eprintln!("running {:?}", cmd); + cmd.stderr(Stdio::inherit()); + let status = cmd.status().unwrap_or_else(|e| { + panic!("failed to execute [{:?}]: {}", cmd, e); + }); + if !status.success() { + panic!("execution failed"); + } +} + +fn sources_for_arch(arch: &str) -> Vec { + RING_SRCS + .iter() + .filter(|&&(archs, _)| archs.is_empty() || archs.contains(&arch)) + .map(|&(_, p)| PathBuf::from(p)) + .collect::>() +} + +fn perlasm_src_dsts(out_dir: &Path, asm_target: &AsmTarget) -> Vec<(PathBuf, PathBuf)> { + let srcs = sources_for_arch(asm_target.arch); + let mut src_dsts = srcs + .iter() + .filter(|p| is_perlasm(p)) + .map(|src| (src.clone(), asm_path(out_dir, src, asm_target))) + .collect::>(); + + // Some PerlAsm source files need to be run multiple times with different + // output paths. + { + // Appease the borrow checker. + let mut maybe_synthesize = |concrete, synthesized| { + let concrete_path = PathBuf::from(concrete); + if srcs.contains(&concrete_path) { + let synthesized_path = PathBuf::from(synthesized); + src_dsts.push(( + concrete_path, + asm_path(out_dir, &synthesized_path, asm_target), + )) + } + }; + maybe_synthesize(SHA512_X86_64, SHA256_X86_64); + maybe_synthesize(SHA512_ARMV8, SHA256_ARMV8); + } + + src_dsts +} + +fn asm_srcs(perlasm_src_dsts: Vec<(PathBuf, PathBuf)>) -> Vec { + perlasm_src_dsts + .into_iter() + .map(|(_src, dst)| dst) + .collect::>() +} + +fn is_perlasm(path: &Path) -> bool { + path.extension().unwrap().to_str().unwrap() == "pl" +} + +fn asm_path(out_dir: &Path, src: &Path, asm_target: &AsmTarget) -> PathBuf { + let src_stem = src.file_stem().expect("source file without basename"); + + let dst_stem = src_stem.to_str().unwrap(); + let dst_filename = format!("{}-{}", dst_stem, asm_target.perlasm_format); + let extension = if asm_target.use_nasm() { "asm" } else { "S" }; + out_dir.join(dst_filename).with_extension(extension) +} + +fn perlasm( + perl_exe: &Path, + src_dst: &[(PathBuf, PathBuf)], + asm_target: &AsmTarget, + c_root_dir: &Path, +) { + for (src, dst) in src_dst { + let mut args = vec![ + join_components_with_forward_slashes(&c_root_dir.join(src)), + asm_target.perlasm_format.into(), + ]; + if asm_target.arch == X86 { + args.push("-fPIC".into()); + } + // Work around PerlAsm issue for ARM and AAarch64 targets by replacing + // back slashes with forward slashes. 
+ args.push(join_components_with_forward_slashes(dst)); + run_command_with_args(perl_exe, &args); + } +} + +fn join_components_with_forward_slashes(path: &Path) -> OsString { + let parts = path.components().map(|c| c.as_os_str()).collect::>(); + parts.join(OsStr::new("/")) +} + +fn get_perl_exe() -> PathBuf { + get_command("PERL_EXECUTABLE", "perl") +} + +fn get_command(var: &'static str, default: &str) -> PathBuf { + PathBuf::from(env::var_os(var).unwrap_or_else(|| default.into())) +} + +// TODO: We should emit `cargo:rerun-if-changed-env` for the various +// environment variables that affect the build. +fn emit_rerun_if_changed() { + for path in &["crypto", "include", "third_party/fiat"] { + walk_dir(&PathBuf::from(path), &|entry| { + let path = entry.path(); + match path.extension().and_then(|ext| ext.to_str()) { + Some("c") | Some("S") | Some("h") | Some("inl") | Some("pl") | None => { + println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); + } + _ => { + // Ignore other types of files. + } + } + }) + } +} + +fn walk_dir(dir: &Path, cb: &impl Fn(&DirEntry)) { + if dir.is_dir() { + for entry in fs::read_dir(dir).unwrap() { + let entry = entry.unwrap(); + let path = entry.path(); + if path.is_dir() { + walk_dir(&path, cb); + } else { + cb(&entry); + } + } + } +} + +/// Creates the necessary header files for symbol renaming. +/// +/// For simplicity, both non-Nasm- and Nasm- style headers are always +/// generated, even though local non-packaged builds need only one of them. +fn generate_prefix_symbols_headers( + out_dir: &Path, + core_name_and_version: &str, +) -> Result<(), std::io::Error> { + let prefix = &(String::from(core_name_and_version) + "_"); + + generate_prefix_symbols_header(out_dir, "prefix_symbols.h", '#', None, prefix)?; + + generate_prefix_symbols_header( + out_dir, + "prefix_symbols_asm.h", + '#', + Some("#if defined(__APPLE__)"), + prefix, + )?; + + generate_prefix_symbols_header( + out_dir, + "prefix_symbols_nasm.inc", + '%', + Some("%ifidn __OUTPUT_FORMAT__,win32"), + prefix, + )?; + + Ok(()) +} + +fn generate_prefix_symbols_header( + out_dir: &Path, + filename: &str, + pp: char, + prefix_condition: Option<&str>, + prefix: &str, +) -> Result<(), std::io::Error> { + let dir = out_dir.join("ring_core_generated"); + fs::create_dir_all(&dir)?; + + let path = dir.join(filename); + let mut file = fs::File::create(path)?; + + let filename_ident = filename.replace('.', "_").to_uppercase(); + writeln!( + file, + r#" +{pp}ifndef ring_core_generated_{filename_ident} +{pp}define ring_core_generated_{filename_ident} +"#, + pp = pp, + filename_ident = filename_ident + )?; + + if let Some(prefix_condition) = prefix_condition { + writeln!(file, "{}", prefix_condition)?; + writeln!(file, "{}", prefix_all_symbols(pp, "_", prefix))?; + writeln!(file, "{pp}else", pp = pp)?; + }; + writeln!(file, "{}", prefix_all_symbols(pp, "", prefix))?; + if prefix_condition.is_some() { + writeln!(file, "{pp}endif", pp = pp)? + } + + writeln!(file, "{pp}endif", pp = pp)?; + + Ok(()) +} + +fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { + // Rename some nistz256 assembly functions to match the names of their + // polyfills. 
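+    //
+    // The returned string is a list of `#define`/`%define` lines (`pp`
+    // selects the preprocessor character): renamed symbols map the old name
+    // to the new one, and every symbol in `SYMBOLS_TO_PREFIX` gets `prefix`
+    // prepended, so that symbols from different versions of this crate do
+    // not collide at link time.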
+ static SYMBOLS_TO_RENAME: &[(&str, &str)] = &[ + ("ecp_nistz256_point_double", "p256_point_double"), + ("ecp_nistz256_point_add", "p256_point_add"), + ("ecp_nistz256_point_add_affine", "p256_point_add_affine"), + ("ecp_nistz256_ord_mul_mont", "p256_scalar_mul_mont"), + ("ecp_nistz256_ord_sqr_mont", "p256_scalar_sqr_rep_mont"), + ("ecp_nistz256_mul_mont", "p256_mul_mont"), + ("ecp_nistz256_sqr_mont", "p256_sqr_mont"), + ]; + + static SYMBOLS_TO_PREFIX: &[&str] = &[ + "adx_bmi2_available", + "avx2_available", + "CRYPTO_memcmp", + "CRYPTO_poly1305_finish", + "CRYPTO_poly1305_finish_neon", + "CRYPTO_poly1305_init", + "CRYPTO_poly1305_init_neon", + "CRYPTO_poly1305_update", + "CRYPTO_poly1305_update_neon", + "ChaCha20_ctr32", + "ChaCha20_ctr32_avx2", + "ChaCha20_ctr32_neon", + "ChaCha20_ctr32_nohw", + "ChaCha20_ctr32_ssse3", + "ChaCha20_ctr32_ssse3_4x", + "LIMB_is_zero", + "LIMBS_add_mod", + "LIMBS_are_zero", + "LIMBS_equal", + "LIMBS_less_than", + "LIMBS_reduce_once", + "LIMBS_select_512_32", + "LIMBS_shl_mod", + "LIMBS_sub_mod", + "LIMBS_window5_split_window", + "LIMBS_window5_unsplit_window", + "LIMB_shr", + "OPENSSL_cpuid_setup", + "aes_gcm_dec_kernel", + "aes_gcm_dec_update_vaes_avx2", + "aes_gcm_enc_kernel", + "aes_gcm_enc_update_vaes_avx2", + "aes_hw_ctr32_encrypt_blocks", + "aes_hw_set_encrypt_key", + "aes_hw_set_encrypt_key_alt", + "aes_hw_set_encrypt_key_base", + "aes_nohw_ctr32_encrypt_blocks", + "aes_nohw_encrypt", + "aes_nohw_set_encrypt_key", + "aesni_gcm_decrypt", + "aesni_gcm_encrypt", + "bn_from_montgomery_in_place", + "bn_gather5", + "bn_mul_mont", + "bn_mul_mont_nohw", + "bn_mul4x_mont", + "bn_mulx4x_mont", + "bn_mul8x_mont_neon", + "bn_mul4x_mont_gather5", + "bn_mulx4x_mont_gather5", + "bn_neg_inv_mod_r_u64", + "bn_power5_nohw", + "bn_powerx5", + "bn_scatter5", + "bn_sqr8x_internal", + "bn_sqr8x_mont", + "bn_sqrx8x_internal", + "bsaes_ctr32_encrypt_blocks", + "bssl_constant_time_test_conditional_memcpy", + "bssl_constant_time_test_conditional_memxor", + "bssl_constant_time_test_main", + "chacha20_poly1305_open", + "chacha20_poly1305_open_avx2", + "chacha20_poly1305_open_sse41", + "chacha20_poly1305_seal", + "chacha20_poly1305_seal_avx2", + "chacha20_poly1305_seal_sse41", + "ecp_nistz256_mul_mont_adx", + "ecp_nistz256_mul_mont_nohw", + "ecp_nistz256_ord_mul_mont_adx", + "ecp_nistz256_ord_mul_mont_nohw", + "ecp_nistz256_ord_sqr_mont_adx", + "ecp_nistz256_ord_sqr_mont_nohw", + "ecp_nistz256_point_add_adx", + "ecp_nistz256_point_add_nohw", + "ecp_nistz256_point_add_affine_adx", + "ecp_nistz256_point_add_affine_nohw", + "ecp_nistz256_point_double_adx", + "ecp_nistz256_point_double_nohw", + "ecp_nistz256_select_w5_avx2", + "ecp_nistz256_select_w5_nohw", + "ecp_nistz256_select_w7_avx2", + "ecp_nistz256_select_w7_nohw", + "ecp_nistz256_sqr_mont_adx", + "ecp_nistz256_sqr_mont_nohw", + "fiat_curve25519_adx_mul", + "fiat_curve25519_adx_square", + "gcm_ghash_avx", + "gcm_ghash_clmul", + "gcm_ghash_neon", + "gcm_ghash_vpclmulqdq_avx2_1", + "gcm_gmult_clmul", + "gcm_gmult_neon", + "gcm_init_avx", + "gcm_init_clmul", + "gcm_init_neon", + "gcm_init_vpclmulqdq_avx2", + "k25519Precomp", + "limbs_mul_add_limb", + "little_endian_bytes_from_scalar", + "ecp_nistz256_neg", + "ecp_nistz256_select_w5", + "ecp_nistz256_select_w7", + "neon_available", + "p256_mul_mont", + "p256_point_add", + "p256_point_add_affine", + "p256_point_double", + "p256_point_mul", + "p256_point_mul_base", + "p256_point_mul_base_vartime", + "p256_scalar_mul_mont", + "p256_scalar_sqr_rep_mont", + "p256_sqr_mont", + 
"p384_elem_div_by_2", + "p384_elem_mul_mont", + "p384_elem_neg", + "p384_elem_sub", + "p384_point_add", + "p384_point_double", + "p384_point_mul", + "p384_scalar_mul_mont", + "openssl_poly1305_neon2_addmulmod", + "openssl_poly1305_neon2_blocks", + "sha256_block_data_order", + "sha256_block_data_order_avx", + "sha256_block_data_order_ssse3", + "sha256_block_data_order_hw", + "sha256_block_data_order_neon", + "sha256_block_data_order_nohw", + "sha512_block_data_order", + "sha512_block_data_order_avx", + "sha512_block_data_order_hw", + "sha512_block_data_order_neon", + "sha512_block_data_order_nohw", + "vpaes_ctr32_encrypt_blocks", + "vpaes_encrypt", + "vpaes_encrypt_key_to_bsaes", + "vpaes_set_encrypt_key", + "x25519_NEON", + "x25519_fe_invert", + "x25519_fe_isnegative", + "x25519_fe_mul_ttt", + "x25519_fe_neg", + "x25519_fe_tobytes", + "x25519_ge_double_scalarmult_vartime", + "x25519_ge_frombytes_vartime", + "x25519_ge_scalarmult_base", + "x25519_ge_scalarmult_base_adx", + "x25519_public_from_private_generic_masked", + "x25519_sc_mask", + "x25519_sc_muladd", + "x25519_sc_reduce", + "x25519_scalar_mult_adx", + "x25519_scalar_mult_generic_masked", + ]; + + let mut out = String::new(); + + for (old, new) in SYMBOLS_TO_RENAME { + let line = format!( + "{pp}define {prefix_prefix}{old} {prefix_prefix}{new}\n", + pp = pp, + prefix_prefix = prefix_prefix, + old = old, + new = new + ); + out += &line; + } + + for symbol in SYMBOLS_TO_PREFIX { + let line = format!( + "{pp}define {prefix_prefix}{symbol} {prefix_prefix}{prefix}{symbol}\n", + pp = pp, + prefix_prefix = prefix_prefix, + prefix = prefix, + symbol = symbol + ); + out += &line; + } + + out +} diff --git a/ring-0.17.14/crypto/chacha/asm/chacha-armv4.pl b/ring-0.17.14/crypto/chacha/asm/chacha-armv4.pl new file mode 100644 index 0000000000..700da12005 --- /dev/null +++ b/ring-0.17.14/crypto/chacha/asm/chacha-armv4.pl @@ -0,0 +1,1137 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# December 2014 +# +# ChaCha20 for ARMv4. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU +# +# Cortex-A5 19.3(*)/+95% 21.8 14.1 +# Cortex-A8 10.5(*)/+160% 13.9 6.35 +# Cortex-A9 12.9(**)/+110% 14.3 6.50 +# Cortex-A15 11.0/+40% 16.0 5.00 +# Snapdragon S4 11.5/+125% 13.6 4.90 +# +# (*) most "favourable" result for aligned data on little-endian +# processor, result for misaligned data is 10-15% lower; +# (**) this result is a trade-off: it can be improved by 20%, +# but then Snapdragon S4 and Cortex-A8 results get +# 20-25% worse; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); +my @t=map("r$_",(8..11)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my $odd = $d0&1; +my ($xc,$xc_) = (@t[0..1]); +my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); +my @ret; + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' are permanently allocated in registers, @x[0..7], + # while 'c's and pair of 'd's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. If you observe 'd' column, you'll + # notice that 15 and 13 are reused in next pair of rounds. + # This is why these two are chosen for offloading to memory, + # to make loads count more. 
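+	#
+	# Nothing is emitted here directly: each string pushed onto @ret is
+	# eval()ed by the caller (see the `foreach (&ROUND(...)) { eval; }`
+	# loops below), which lets the integer round be interleaved, one
+	# instruction at a time, with the NEON rounds in the 3xNEON+1xIALU path.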
+ push @ret,( + "&add (@x[$a0],@x[$a0],@x[$b0])", + "&mov ($xd,$xd,'ror#16')", + "&add (@x[$a1],@x[$a1],@x[$b1])", + "&mov ($xd_,$xd_,'ror#16')", + "&eor ($xd,$xd,@x[$a0],'ror#16')", + "&eor ($xd_,$xd_,@x[$a1],'ror#16')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b0],@x[$b0],'ror#20')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b1],@x[$b1],'ror#20')", + "&eor (@x[$b0],@x[$b0],$xc,'ror#20')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')", + + "&add (@x[$a0],@x[$a0],@x[$b0])", + "&mov ($xd,$xd,'ror#24')", + "&add (@x[$a1],@x[$a1],@x[$b1])", + "&mov ($xd_,$xd_,'ror#24')", + "&eor ($xd,$xd,@x[$a0],'ror#24')", + "&eor ($xd_,$xd_,@x[$a1],'ror#24')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b0],@x[$b0],'ror#25')" ); + push @ret,( + "&str ($xd,'[sp,#4*(16+$d0)]')", + "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); + push @ret,( + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b1],@x[$b1],'ror#25')" ); + push @ret,( + "&str ($xd_,'[sp,#4*(16+$d1)]')", + "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); + push @ret,( + "&eor (@x[$b0],@x[$b0],$xc,'ror#25')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" ); + + $xd=@x[$d2] if (!$odd); + $xd_=@x[$d3] if ($odd); + push @ret,( + "&str ($xc,'[sp,#4*(16+$c0)]')", + "&ldr ($xc,'[sp,#4*(16+$c2)]')", + "&add (@x[$a2],@x[$a2],@x[$b2])", + "&mov ($xd,$xd,'ror#16')", + "&str ($xc_,'[sp,#4*(16+$c1)]')", + "&ldr ($xc_,'[sp,#4*(16+$c3)]')", + "&add (@x[$a3],@x[$a3],@x[$b3])", + "&mov ($xd_,$xd_,'ror#16')", + "&eor ($xd,$xd,@x[$a2],'ror#16')", + "&eor ($xd_,$xd_,@x[$a3],'ror#16')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b2],@x[$b2],'ror#20')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b3],@x[$b3],'ror#20')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#20')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')", + + "&add (@x[$a2],@x[$a2],@x[$b2])", + "&mov ($xd,$xd,'ror#24')", + "&add (@x[$a3],@x[$a3],@x[$b3])", + "&mov ($xd_,$xd_,'ror#24')", + "&eor ($xd,$xd,@x[$a2],'ror#24')", + "&eor ($xd_,$xd_,@x[$a3],'ror#24')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b2],@x[$b2],'ror#25')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b3],@x[$b3],'ror#25')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#25')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" ); + + @ret; +} + +$code.=<<___; +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
+.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 + +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} + adr r14,.Lsigma + ldmia r12,{r4-r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + stmdb sp!,{r4-r7} @ copy counter and nonce + ldmia r3,{r4-r11} @ load key + ldmia r14,{r0-r3} @ load sigma + stmdb sp!,{r4-r11} @ copy key + stmdb sp!,{r0-r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0-r9} @ load key material + str @t[3],[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr @t[3], [sp,#4*(15)] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + ldr @t[2], [sp,#4*(13)] + ldr @x[14],[sp,#4*(14)] + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + b .Loop + +.align 4 +.Loop: + subs @t[3],@t[3],#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + bne .Loop + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str @x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + cmp @t[3],#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr @t[0],[sp,#4*(0)] @ load key material + ldr @t[1],[sp,#4*(1)] + +#if __ARM_ARCH>=6 || !defined(__ARMEB__) +# if __ARM_ARCH<7 + orr @t[2],r12,r14 + tst @t[2],#3 @ are input and output aligned? 
+ ldr @t[2],[sp,#4*(2)] + bne .Lunaligned + cmp @t[3],#64 @ restore flags +# else + ldr @t[2],[sp,#4*(2)] +# endif + ldr @t[3],[sp,#4*(3)] + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] @ xor with input + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(4) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[1],[r14,#-12] + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @x[5],@x[5],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] + add @t[0],sp,#4*(8) + str @x[4],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + eorhs @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(12) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + str @x[1],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @x[5],@x[5],@t[1] +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + 
eorhs @x[7],@x[7],@t[3] + str @x[4],[r14],#16 @ store output + str @x[5],[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH<7 + b .Ltail + +.align 4 +.Lunaligned: @ unaligned endian-neutral path + cmp @t[3],#64 @ restore flags +# endif +#endif +#if __ARM_ARCH<7 + ldr @t[3],[sp,#4*(3)] +___ +for ($i=0;$i<16;$i+=4) { +my $j=$i&0x7; + +$code.=<<___ if ($i==4); + add @x[0],sp,#4*(16+8) +___ +$code.=<<___ if ($i==8); + ldmia @x[0],{@x[0]-@x[7]} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" +___ +$code.=<<___; + add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material +___ +$code.=<<___ if ($i==12); +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +___ +$code.=<<___; + add @x[$j+1],@x[$j+1],@t[1] + add @x[$j+2],@x[$j+2],@t[2] +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[0],@t[0],@t[0] @ zero or ... + ldrhsb @t[0],[r12],#16 @ ... load input + eorlo @t[1],@t[1],@t[1] + ldrhsb @t[1],[r12,#-12] + + add @x[$j+3],@x[$j+3],@t[3] +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[2],@t[2],@t[2] + ldrhsb @t[2],[r12,#-8] + eorlo @t[3],@t[3],@t[3] + ldrhsb @t[3],[r12,#-4] + + eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) + eor @x[$j+1],@t[1],@x[$j+1] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-15] @ load more input + ldrhsb @t[1],[r12,#-11] + eor @x[$j+2],@t[2],@x[$j+2] + strb @x[$j+0],[r14],#16 @ store output + eor @x[$j+3],@t[3],@x[$j+3] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-7] + ldrhsb @t[3],[r12,#-3] + strb @x[$j+1],[r14,#-12] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-8] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-14] @ load more input + ldrhsb @t[1],[r12,#-10] + strb @x[$j+3],[r14,#-4] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-15] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-6] + ldrhsb @t[3],[r12,#-2] + strb @x[$j+1],[r14,#-11] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-7] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-13] @ load more input + ldrhsb @t[1],[r12,#-9] + strb @x[$j+3],[r14,#-3] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-14] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-5] + ldrhsb @t[3],[r12,#-1] + strb @x[$j+1],[r14,#-10] + strb @x[$j+2],[r14,#-6] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+3],[r14,#-2] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 + strb @x[$j+0],[r14,#-13] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+1],[r14,#-9] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 + strb @x[$j+2],[r14,#-5] + strb @x[$j+3],[r14,#-1] +___ +$code.=<<___ if ($i<12); + add @t[0],sp,#4*(4+$i) + ldmia @t[0],{@t[0]-@t[3]} @ load key material +___ +} +$code.=<<___; +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add @t[1],sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb @t[2],[@t[1]],#1 @ read buffer on stack + ldrb @t[3],[r12],#1 @ read input + subs @t[0],@t[0],#1 + eor 
@t[3],@t[3],@t[2] + strb @t[3],[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) + ldmia sp!,{r4-r11,pc} +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +___ + +{{{ +my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = + map("q$_",(0..15)); + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&vadd_i32 ($a,$a,$b)", + "&veor ($d,$d,$a)", + "&vrev32_16 ($d,$d)", # vrot ($d,16) + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,20)", + "&vsli_32 ($b,$t,12)", + + "&vadd_i32 ($a,$a,$b)", + "&veor ($t,$d,$a)", + "&vshr_u32 ($d,$t,24)", + "&vsli_32 ($d,$t,8)", + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,25)", + "&vsli_32 ($b,$t,7)", + + "&vext_8 ($c,$c,$c,8)", + "&vext_8 ($b,$b,$b,$odd?12:4)", + "&vext_8 ($d,$d,$d,$odd?4:12)" + ); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} + adr r14,.Lsigma + vstmdb sp!,{d8-d15} @ ABI spec says so + stmdb sp!,{r0-r3} + + vld1.32 {$b0-$c0},[r3] @ load key + ldmia r3,{r4-r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {$d0},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0-r3} @ load sigma + vld1.32 {$a0},[r14]! @ load sigma + vld1.32 {$t0},[r14] @ one + vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce + vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + vshl.i32 $t1#lo,$t0#lo,#1 @ two + vstr $t0#lo,[sp,#4*(16+0)] + vshl.i32 $t2#lo,$t0#lo,#2 @ four + vstr $t1#lo,[sp,#4*(16+2)] + vmov $a1,$a0 + vstr $t2#lo,[sp,#4*(16+4)] + vmov $a2,$a0 + vmov $b1,$b0 + vmov $b2,$b0 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0-r9} @ load key material + cmp @t[3],#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov $a1,$a0 + str @t[3],[sp,#4*(32+2)] @ save len + vmov $a2,$a0 + str r12, [sp,#4*(32+1)] @ save inp + vmov $b1,$b0 + str r14, [sp,#4*(32+0)] @ save out + vmov $b2,$b0 +.Loop_neon_enter: + ldr @t[3], [sp,#4*(15)] + vadd.i32 $d1,$d0,$t0 @ counter+1 + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + vmov $c1,$c0 + ldr @t[2], [sp,#4*(13)] + vmov $c2,$c0 + ldr @x[14],[sp,#4*(14)] + vadd.i32 $d2,$d1,$t0 @ counter+2 + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + add @x[12],@x[12],#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs @t[3],@t[3],#1 +___ + my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); + my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); + my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); + @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); + @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + bne .Loop_neon + + add @t[3],sp,#32 + vld1.32 {$t0-$t1},[sp] @ load key material + vld1.32 {$t2-$t3},[@t[3]] + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str 
@x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 $a0,$a0,$t0 @ accumulate key material + vadd.i32 $a1,$a1,$t0 + vadd.i32 $a2,$a2,$t0 + vldr $t0#lo,[sp,#4*(16+0)] @ one + + vadd.i32 $b0,$b0,$t1 + vadd.i32 $b1,$b1,$t1 + vadd.i32 $b2,$b2,$t1 + vldr $t1#lo,[sp,#4*(16+2)] @ two + + vadd.i32 $c0,$c0,$t2 + vadd.i32 $c1,$c1,$t2 + vadd.i32 $c2,$c2,$t2 + vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 + vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 + + vadd.i32 $d0,$d0,$t3 + vadd.i32 $d1,$d1,$t3 + vadd.i32 $d2,$d2,$t3 + + cmp @t[3],#64*4 + blo .Ltail_neon + + vld1.8 {$t0-$t1},[r12]! @ load input + mov @t[3],sp + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 @ xor with input + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + vst1.8 {$a0-$b0},[r14]! @ store output + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration + veor $t0#hi,$t0#hi,$t0#hi + vldr $t0#lo,[sp,#4*(16+4)] @ four + veor $b2,$b2,$t1 + vld1.32 {$c0-$d0},[@t[3]] + veor $c2,$c2,$t2 + vst1.8 {$a1-$b1},[r14]! + veor $d2,$d2,$t3 + vst1.8 {$c1-$d1},[r14]! + + vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value + vldr $t0#lo,[sp,#4*(16+0)] @ one + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + vst1.8 {$a2-$b2},[r14]! + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] + vst1.8 {$c2-$d2},[r14]! + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] @ xor with input + add @t[0],sp,#4*(4) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[5],@x[5],@t[1] + ldr @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + ldr @t[2],[r12,#-8] + add @x[7],@x[7],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] + add @t[0],sp,#4*(8) + eor @x[5],@x[5],@t[1] + str @x[4],[r14],#16 @ store output + eor @x[6],@x[6],@t[2] + str @x[5],[r14,#-12] + eor @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] + add @t[0],sp,#4*(12) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor 
@x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],@t[0],#4 @ next counter value + add @x[5],@x[5],@t[1] + str @t[0],[sp,#4*(12)] @ save next counter value + ldr @t[0],[r12],#16 @ load input + add @x[6],@x[6],@t[2] + add @x[4],@x[4],#3 @ counter+3 + ldr @t[1],[r12,#-12] + add @x[7],@x[7],@t[3] + ldr @t[2],[r12,#-8] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] +# ifdef __thumb2__ + it hi +# endif + ldrhi @t[0],[sp,#4*(32+2)] @ re-load len + eor @x[5],@x[5],@t[1] + eor @x[6],@x[6],@t[2] + str @x[4],[r14],#16 @ store output + eor @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + sub @t[3],@t[0],#64*4 @ len-=64*4 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str @t[3], [sp,#4*(20+32+2)] @ save len + add @t[3],sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr @x[12],[sp,#4*(16+10)] + ldr @x[14],[sp,#4*(16+11)] + vldmia @t[3],{d8-d15} @ fulfill ABI requirement + str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" + str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" + + ldr @t[3], [sp,#4*(15)] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + ldr @t[2], [sp,#4*(13)] + ldr @x[14],[sp,#4*(14)] + str @t[3], [sp,#4*(20+16+15)] + add @t[3],sp,#4*(20) + vst1.32 {$a0-$b0},[@t[3]]! @ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {$c0-$d0},[@t[3]] + mov @t[3],#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp @t[3],#64*3 + bhs .L192_or_more_neon + cmp @t[3],#64*2 + bhs .L128_or_more_neon + cmp @t[3],#64*1 + bhs .L64_or_more_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a0-$b0},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c0-$d0},[@t[0]] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vst1.8 {$a0-$b0},[r14]! + vst1.8 {$c0-$d0},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a1-$b1},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c1-$d1},[@t[0]] + sub @t[3],@t[3],#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vst1.8 {$a0-$b0},[r14]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vst1.8 {$a1-$b1},[r14]! + vst1.8 {$c1-$d1},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a2-$b2},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c2-$d2},[@t[0]] + sub @t[3],@t[3],#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$a0-$b0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vst1.8 {$c0-$d0},[r14]! + veor $b2,$b2,$t1 + vst1.8 {$a1-$b1},[r14]! + veor $c2,$c2,$t2 + vst1.8 {$c1-$d1},[r14]! 
+ veor $d2,$d2,$t3 + vst1.8 {$a2-$b2},[r14]! + vst1.8 {$c2-$d2},[r14]! + + beq .Ldone_neon + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(4) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@x[5],@t[1] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia sp,{@x[0]-@x[7]} + add @x[0],sp,#4*(16+8) + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(12) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@x[5],@t[1] + add @x[4],@x[4],#3 @ counter+3 + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] + ldr @t[3],[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia @t[0],{@x[0]-@x[7]} + add @t[2],sp,#4*(0) + sub @t[3],@t[3],#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb @t[0],[@t[2]],#1 @ read buffer on stack + ldrb @t[1],[r12],#1 @ read input + subs @t[3],@t[3],#1 + eor @t[0],@t[0],@t[1] + strb @t[0],[r14],#1 @ store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8-d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4-r11,pc} +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +#endif +___ +}}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/chacha/asm/chacha-armv8.pl b/ring-0.17.14/crypto/chacha/asm/chacha-armv8.pl new file mode 100644 index 0000000000..8f2fe30a7c --- /dev/null +++ b/ring-0.17.14/crypto/chacha/asm/chacha-armv8.pl @@ -0,0 +1,1130 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# June 2015 +# +# ChaCha20 for ARMv8. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU +# +# Apple A7 5.50/+49% 3.33 1.70 +# Cortex-A53 8.40/+80% 4.72 4.72(*) +# Cortex-A57 8.06/+43% 4.90 4.43(**) +# Denver 4.50/+82% 2.63 2.67(*) +# X-Gene 9.50/+46% 8.82 8.89(*) +# Mongoose 8.00/+44% 3.64 3.25 +# Kryo 8.17/+50% 4.83 4.65 +# +# (*) it's expected that doubling interleave factor doesn't help +# all processors, only those with higher NEON latency and +# higher instruction issue rate; +# (**) expected improvement was actually higher; + +$flavour=shift; +$output=shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); + +my @x=map("x$_",(5..17,19..21)); +my @d=map("x$_",(22..28,30)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); + + ( + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],16)", + "&ror_32 (@x[$d1],@x[$d1],16)", + "&ror_32 (@x[$d2],@x[$d2],16)", + "&ror_32 (@x[$d3],@x[$d3],16)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],20)", + "&ror_32 (@x[$b1],@x[$b1],20)", + "&ror_32 (@x[$b2],@x[$b2],20)", + "&ror_32 (@x[$b3],@x[$b3],20)", + + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],24)", + "&ror_32 (@x[$d1],@x[$d1],24)", + "&ror_32 (@x[$d2],@x[$d2],24)", + "&ror_32 (@x[$d3],@x[$d3],24)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],25)", + "&ror_32 (@x[$b1],@x[$b1],25)", + "&ror_32 (@x[$b2],@x[$b2],25)", + "&ror_32 (@x[$b3],@x[$b3],25)" + ); +} + +$code.=<<___; +.section .rodata + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by " + +.text + +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp @x[0],:pg_hi21:.Lsigma + add @x[0],@x[0],:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ldp @d[6],@d[7],[$ctr] // load counter +#ifdef __AARCH64EB__ + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + +.Loop_outer: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#64 +.Loop: + sub $ctr,$ctr,#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + cbnz $ctr,.Loop + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + b.lo .Ltail + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +.Ltail: + add $len,$len,#64 +.Less_than_64: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + stp @x[0],@x[2],[sp,#0] + stp @x[4],@x[6],[sp,#16] + stp @x[8],@x[10],[sp,#32] + stp @x[12],@x[14],[sp,#48] + +.Loop_tail: 
+ ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +___ + +{{{ +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = + map("v$_.4s",(0..7,16..23)); +my (@K)=map("v$_.4s",(24..30)); +my $ONE="v31.4s"; + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&add ('$a','$a','$b')", + "&eor ('$d','$d','$a')", + "&rev32_16 ('$d','$d')", # vrot ($d,16) + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',20)", + "&sli ('$b','$t',12)", + + "&add ('$a','$a','$b')", + "&eor ('$t','$d','$a')", + "&ushr ('$d','$t',24)", + "&sli ('$d','$t',8)", + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',25)", + "&sli ('$b','$t',7)", + + "&ext ('$c','$c','$c',8)", + "&ext ('$d','$d','$d',$odd?4:12)", + "&ext ('$b','$b','$b',$odd?12:4)" + ); +} + +$code.=<<___; + +.globl ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp @x[0],:pg_hi21:.Lsigma + add @x[0],@x[0],:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp $len,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __AARCH64EB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + +.Loop_outer_neon: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov $A0,@K[0] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov $A1,@K[0] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov $A2,@K[0] + mov.32 @x[6],@d[3] + mov $B0,@K[1] + lsr @x[7],@d[3],#32 + mov $B1,@K[1] + mov.32 @x[8],@d[4] + mov $B2,@K[1] + lsr @x[9],@d[4],#32 + mov $D0,@K[3] + mov.32 @x[10],@d[5] + mov $D1,@K[4] + lsr @x[11],@d[5],#32 + mov $D2,@K[5] + mov.32 @x[12],@d[6] + mov $C0,@K[2] + lsr @x[13],@d[6],#32 + mov $C1,@K[2] + mov.32 @x[14],@d[7] + mov $C2,@K[2] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#256 +.Loop_neon: + sub $ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + cbnz $ctr,.Loop_neon + + 
add.32 @x[0],@x[0],@d[0] // accumulate key block + add $A0,$A0,@K[0] + add @x[1],@x[1],@d[0],lsr#32 + add $A1,$A1,@K[0] + add.32 @x[2],@x[2],@d[1] + add $A2,$A2,@K[0] + add @x[3],@x[3],@d[1],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[4],@x[4],@d[2] + add $C1,$C1,@K[2] + add @x[5],@x[5],@d[2],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[6],@x[6],@d[3] + add $D0,$D0,@K[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add $D1,$D1,@K[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add $D2,$D2,@K[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add $B0,$B0,@K[1] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add $B1,$B1,@K[1] + add @x[15],@x[15],@d[7],lsr#32 + add $B2,$B2,@K[1] + + b.lo .Ltail_neon + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + add @K[3],@K[3],$ONE // += 4 + stp @x[8],@x[10],[$out,#32] + add @K[4],@K[4],$ONE + stp @x[12],@x[14],[$out,#48] + add @K[5],@K[5],$ONE + add $out,$out,#64 + + st1.8 {$A0-$D0},[$out],#64 + ld1.8 {$A0-$D0},[$inp],#64 + + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + eor $A2,$A2,$A0 + eor $B2,$B2,$B0 + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Ltail_neon: + add $len,$len,#256 + cmp $len,#64 + b.lo .Less_than_64 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + b.eq .Ldone_neon + sub 
$len,$len,#64 + cmp $len,#64 + b.lo .Less_than_128 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A0,$A0,$T0 + eor $B0,$B0,$T1 + eor $C0,$C0,$T2 + eor $D0,$D0,$T3 + st1.8 {$A0-$D0},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 + b.lo .Less_than_192 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + + st1.8 {$A2-$D2},[sp] + b .Last_neon + +.Less_than_128: + st1.8 {$A0-$D0},[sp] + b .Last_neon +.Less_than_192: + st1.8 {$A1-$D1},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + +.Loop_tail_neon: + ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +___ +{ +my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, + $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); + +$code.=<<___; +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp @x[0],:pg_hi21:.Lsigma + add @x[0],@x[0],:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __AARCH64EB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part + add @K[3],@K[3],$ONE // not typo + str @K[2],[sp,#32] + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + add @K[6],@K[5],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub $len,$len,#512 // not typo + +.Loop_outer_512_neon: + mov $A0,@K[0] + mov $A1,@K[0] + mov $A2,@K[0] + mov $A3,@K[0] + mov $A4,@K[0] + mov $A5,@K[0] + mov $B0,@K[1] + mov.32 @x[0],@d[0] // unpack key block + mov $B1,@K[1] + lsr @x[1],@d[0],#32 + mov $B2,@K[1] + mov.32 @x[2],@d[1] + mov $B3,@K[1] + lsr @x[3],@d[1],#32 + mov $B4,@K[1] + mov.32 @x[4],@d[2] + mov $B5,@K[1] + lsr @x[5],@d[2],#32 + mov $D0,@K[3] + mov.32 @x[6],@d[3] + mov $D1,@K[4] + lsr @x[7],@d[3],#32 + mov $D2,@K[5] + mov.32 @x[8],@d[4] + mov $D3,@K[6] + lsr @x[9],@d[4],#32 + mov $C0,@K[2] + mov.32 @x[10],@d[5] + mov $C1,@K[2] + lsr @x[11],@d[5],#32 + add $D4,$D0,$ONE // +4 + mov.32 @x[12],@d[6] + add $D5,$D1,$ONE // +4 + lsr @x[13],@d[6],#32 + mov $C2,@K[2] + mov.32 @x[14],@d[7] + mov $C3,@K[2] + lsr @x[15],@d[7],#32 + mov $C4,@K[2] + stp @K[3],@K[4],[sp,#48] // off-load key block, variable part + mov $C5,@K[2] + str @K[5],[sp,#80] + + mov $ctr,#5 + subs $len,$len,#512 +.Loop_upper_neon: + sub 
$ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + my $diff = ($#thread0+1)*6 - $#thread67 - 1; + my $i = 0; + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_upper_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + stp @x[4],@x[6],[$out,#16] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + stp @x[8],@x[10],[$out,#32] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#5 +.Loop_lower_neon: + sub $ctr,$ctr,#1 +___ + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + 
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_lower_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + ldp @K[0],@K[1],[sp,#0] + add @x[1],@x[1],@d[0],lsr#32 + ldp @K[2],@K[3],[sp,#32] + add.32 @x[2],@x[2],@d[1] + ldp @K[4],@K[5],[sp,#64] + add @x[3],@x[3],@d[1],lsr#32 + add $A0,$A0,@K[0] + add.32 @x[4],@x[4],@d[2] + add $A1,$A1,@K[0] + add @x[5],@x[5],@d[2],lsr#32 + add $A2,$A2,@K[0] + add.32 @x[6],@x[6],@d[3] + add $A3,$A3,@K[0] + add @x[7],@x[7],@d[3],lsr#32 + add $A4,$A4,@K[0] + add.32 @x[8],@x[8],@d[4] + add $A5,$A5,@K[0] + add @x[9],@x[9],@d[4],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[10],@x[10],@d[5] + add $C1,$C1,@K[2] + add @x[11],@x[11],@d[5],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[12],@x[12],@d[6] + add $C3,$C3,@K[2] + add @x[13],@x[13],@d[6],lsr#32 + add $C4,$C4,@K[2] + add.32 @x[14],@x[14],@d[7] + add $C5,$C5,@K[2] + add @x[15],@x[15],@d[7],lsr#32 + add $D4,$D4,$ONE // +4 + add @x[0],@x[0],@x[1],lsl#32 // pack + add $D5,$D5,$ONE // +4 + add @x[2],@x[2],@x[3],lsl#32 + add $D0,$D0,@K[3] + ldp @x[1],@x[3],[$inp,#0] // load input + add $D1,$D1,@K[4] + add @x[4],@x[4],@x[5],lsl#32 + add $D2,$D2,@K[5] + add @x[6],@x[6],@x[7],lsl#32 + add $D3,$D3,@K[6] + ldp @x[5],@x[7],[$inp,#16] + add $D4,$D4,@K[3] + add @x[8],@x[8],@x[9],lsl#32 + add $D5,$D5,@K[4] + add @x[10],@x[10],@x[11],lsl#32 + add $B0,$B0,@K[1] + ldp @x[9],@x[11],[$inp,#32] + add $B1,$B1,@K[1] + add @x[12],@x[12],@x[13],lsl#32 + add $B2,$B2,@K[1] + add @x[14],@x[14],@x[15],lsl#32 + add $B3,$B3,@K[1] + ldp @x[13],@x[15],[$inp,#48] + add $B4,$B4,@K[1] + add $inp,$inp,#64 + add $B5,$B5,@K[1] + +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#7 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + st1.8 {$A0-$D0},[$out],#64 + + ld1.8 {$A0-$D0},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + ld1.8 
{$A1-$D1},[$inp],#64 + eor $A2,$A2,$A0 + ldp @K[0],@K[1],[sp,#0] + eor $B2,$B2,$B0 + ldp @K[2],@K[3],[sp,#32] + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + ld1.8 {$A2-$D2},[$inp],#64 + eor $A3,$A3,$A1 + eor $B3,$B3,$B1 + eor $C3,$C3,$C1 + eor $D3,$D3,$D1 + st1.8 {$A3-$D3},[$out],#64 + + ld1.8 {$A3-$D3},[$inp],#64 + eor $A4,$A4,$A2 + eor $B4,$B4,$B2 + eor $C4,$C4,$C2 + eor $D4,$D4,$D2 + st1.8 {$A4-$D4},[$out],#64 + + shl $A0,$ONE,#1 // 4 -> 8 + eor $A5,$A5,$A3 + eor $B5,$B5,$B3 + eor $C5,$C5,$C3 + eor $D5,$D5,$D3 + st1.8 {$A5-$D5},[$out],#64 + + add @K[3],@K[3],$A0 // += 8 + add @K[4],@K[4],$A0 + add @K[5],@K[5],$A0 + add @K[6],@K[6],$A0 + + b.hs .Loop_outer_512_neon + + adds $len,$len,#512 + ushr $A0,$ONE,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp @K[0],$ONE,[sp,#0] // wipe off-load area + stp @K[0],$ONE,[sp,#32] + stp @K[0],$ONE,[sp,#64] + + b.eq .Ldone_512_neon + + cmp $len,#192 + sub @K[3],@K[3],$A0 // -= 1 + sub @K[4],@K[4],$A0 + sub @K[5],@K[5],$A0 + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor @K[1],@K[1],@K[1] + eor @K[2],@K[2],@K[2] + eor @K[3],@K[3],@K[3] + eor @K[4],@K[4],@K[4] + eor @K[5],@K[5],@K[5] + eor @K[6],@K[6],@K[6] + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +___ +} +}}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or + (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or + (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or + (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or + (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); + + #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # flush diff --git a/ring-0.17.14/crypto/chacha/asm/chacha-x86.pl b/ring-0.17.14/crypto/chacha/asm/chacha-x86.pl new file mode 100644 index 0000000000..06f8ad9f46 --- /dev/null +++ b/ring-0.17.14/crypto/chacha/asm/chacha-x86.pl @@ -0,0 +1,480 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# January 2015 +# +# ChaCha20 for x86. +# +# Performance in cycles per byte out of large buffer. 
+# +# 1xIALU/gcc 4xSSSE3 +# Pentium 17.5/+80% +# PIII 14.2/+60% +# P4 18.6/+84% +# Core2 9.56/+89% 4.83 +# Westmere 9.50/+45% 3.35 +# Sandy Bridge 10.5/+47% 3.20 +# Haswell 8.15/+50% 2.83 +# Skylake 7.53/+22% 2.75 +# Silvermont 17.4/+36% 8.35 +# Goldmont 13.4/+40% 4.36 +# Sledgehammer 10.2/+54% +# Bulldozer 13.4/+50% 4.38(*) +# +# (*) Bulldozer actually executes 4xXOP code path that delivers 3.55; +# +# Modified from upstream OpenSSL to remove the XOP code. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); + +$xmm=$ymm=1; +$gasver=999; # enable everything + +$a="eax"; +($b,$b_)=("ebx","ebp"); +($c,$c_)=("ecx","esi"); +($d,$d_)=("edx","edi"); + +&static_label("ssse3_data"); +&static_label("pic_point"); + +if ($xmm) { +my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); +my ($out,$inp,$len)=("edi","esi","ecx"); + +sub QUARTERROUND_SSSE3 { +my ($ai,$bi,$ci,$di,$i)=@_; +my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next +my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous + + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + + if ($i==0) { + my $j=4; + ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); + } elsif ($i==3) { + my $j=0; + ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); + } elsif ($i==4) { + my $j=4; + ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); + } elsif ($i==7) { + my $j=0; + ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); + } + + #&paddd ($xa,$xb); # see elsewhere + #&pxor ($xd,$xa); # see elsewhere + &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); + &pshufb ($xd,&QWP(0,"eax")); # rot16 + &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); + &paddd ($xc,$xd); + &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); + &pxor ($xb,$xc); + &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); + &movdqa ($xa_,$xb); # borrow as temporary + &pslld ($xb,12); + &psrld ($xa_,20); + &por ($xb,$xa_); + &movdqa($xa_,&QWP(16*$an-128,"ebx")); + &paddd ($xa,$xb); + &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); + &pxor ($xd,$xa); + &movdqa (&QWP(16*$ai-128,"ebx"),$xa); + &pshufb ($xd,&QWP(16,"eax")); # rot8 + &paddd ($xc,$xd); + &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); + &movdqa ($xd_,$xd) if ($di==$dn); + &pxor ($xb,$xc); + &paddd ($xa_,$xb_) if ($i<7); # elsewhere + &movdqa ($xa,$xb); # borrow as temporary + &pslld ($xb,7); + &psrld ($xa,25); + &pxor ($xd_,$xa_) if ($i<7); # elsewhere + &por ($xb,$xa); + + ($xa,$xa_)=($xa_,$xa); + ($xb,$xb_)=($xb_,$xb); + ($xc,$xc_)=($xc_,$xc); + ($xd,$xd_)=($xd_,$xd); +} + +&function_begin("ChaCha20_ctr32_ssse3"); + &call (&label("pic_point")); +&set_label("pic_point"); + &blindpop("eax"); + + &mov ($out,&wparam(0)); + &mov ($inp,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ("edx",&wparam(3)); # key + &mov ("ebx",&wparam(4)); # counter and nonce + + &mov ("ebp","esp"); + &stack_push (131); + &and ("esp",-64); + &mov (&DWP(512,"esp"),"ebp"); + + &lea ("eax",&DWP(&label("ssse3_data")."-". + &label("pic_point"),"eax")); + &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce + +if (defined($gasver) && $gasver>=2.17) { # even though we encode + # pshufb manually, we + # handle only register + # operands, while this + # segment uses memory + # operand... 
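+	# Inputs of at least 256 bytes (64*4) take the four-block interleaved
+	# path below; shorter inputs branch to the single-block "1x" code
+	# further down.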
+ &cmp ($len,64*4); + &jb (&label("1x")); + + &mov (&DWP(512+4,"esp"),"edx"); # offload pointers + &mov (&DWP(512+8,"esp"),"ebx"); + &sub ($len,64*4); # bias len + &lea ("ebp",&DWP(256+128,"esp")); # size optimization + + &movdqu ("xmm7",&QWP(0,"edx")); # key + &pshufd ("xmm0","xmm3",0x00); + &pshufd ("xmm1","xmm3",0x55); + &pshufd ("xmm2","xmm3",0xaa); + &pshufd ("xmm3","xmm3",0xff); + &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters + &pshufd ("xmm4","xmm7",0x00); + &pshufd ("xmm5","xmm7",0x55); + &psubd ("xmm0",&QWP(16*4,"eax")); + &pshufd ("xmm6","xmm7",0xaa); + &pshufd ("xmm7","xmm7",0xff); + &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); + &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); + &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); + &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); + &movdqu ("xmm3",&QWP(16,"edx")); # key + &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); + &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); + &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); + &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); + &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma + &lea ("ebx",&DWP(128,"esp")); # size optimization + + &pshufd ("xmm0","xmm3",0x00); + &pshufd ("xmm1","xmm3",0x55); + &pshufd ("xmm2","xmm3",0xaa); + &pshufd ("xmm3","xmm3",0xff); + &pshufd ("xmm4","xmm7",0x00); + &pshufd ("xmm5","xmm7",0x55); + &pshufd ("xmm6","xmm7",0xaa); + &pshufd ("xmm7","xmm7",0xff); + &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); + &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); + &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); + &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); + &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); + &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); + &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); + &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); + + &lea ($inp,&DWP(128,$inp)); # size optimization + &lea ($out,&DWP(128,$out)); # size optimization + &jmp (&label("outer_loop")); + +&set_label("outer_loop",16); + #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material + &movdqa ("xmm1",&QWP(16*1-128,"ebp")); + &movdqa ("xmm2",&QWP(16*2-128,"ebp")); + &movdqa ("xmm3",&QWP(16*3-128,"ebp")); + #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); + &movdqa ("xmm5",&QWP(16*5-128,"ebp")); + &movdqa ("xmm6",&QWP(16*6-128,"ebp")); + &movdqa ("xmm7",&QWP(16*7-128,"ebp")); + #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); + &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); + &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); + &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); + #&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); + &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); + &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); + &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); + #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); + #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); + &movdqa ("xmm2",&QWP(16*10-128,"ebp")); + &movdqa ("xmm3",&QWP(16*11-128,"ebp")); + &movdqa ("xmm4",&QWP(16*12-128,"ebp")); + &movdqa ("xmm5",&QWP(16*13-128,"ebp")); + &movdqa ("xmm6",&QWP(16*14-128,"ebp")); + &movdqa ("xmm7",&QWP(16*15-128,"ebp")); + &paddd ("xmm4",&QWP(16*4,"eax")); # counter value + #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); + #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); + &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); + &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); + &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); + &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); + &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); + &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); + &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value + + &movdqa ($xa, &QWP(16*0-128,"ebp")); + &movdqa ($xd, "xmm4"); + &movdqa ($xb_,&QWP(16*4-128,"ebp")); + &movdqa ($xc, &QWP(16*8-128,"ebp")); + &movdqa ($xc_,&QWP(16*9-128,"ebp")); + + &mov ("edx",10); # loop counter + &nop (); + +&set_label("loop",16); + 
&paddd ($xa,$xb_); # elsewhere + &movdqa ($xb,$xb_); + &pxor ($xd,$xa); # elsewhere + &QUARTERROUND_SSSE3(0, 4, 8, 12, 0); + &QUARTERROUND_SSSE3(1, 5, 9, 13, 1); + &QUARTERROUND_SSSE3(2, 6,10, 14, 2); + &QUARTERROUND_SSSE3(3, 7,11, 15, 3); + &QUARTERROUND_SSSE3(0, 5,10, 15, 4); + &QUARTERROUND_SSSE3(1, 6,11, 12, 5); + &QUARTERROUND_SSSE3(2, 7, 8, 13, 6); + &QUARTERROUND_SSSE3(3, 4, 9, 14, 7); + &dec ("edx"); + &jnz (&label("loop")); + + &movdqa (&QWP(16*4-128,"ebx"),$xb_); + &movdqa (&QWP(16*8-128,"ebx"),$xc); + &movdqa (&QWP(16*9-128,"ebx"),$xc_); + &movdqa (&QWP(16*12-128,"ebx"),$xd); + &movdqa (&QWP(16*14-128,"ebx"),$xd_); + + my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); + + #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there + &movdqa ($xa1,&QWP(16*1-128,"ebx")); + &movdqa ($xa2,&QWP(16*2-128,"ebx")); + &movdqa ($xa3,&QWP(16*3-128,"ebx")); + + for($i=0;$i<256;$i+=64) { + &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material + &paddd ($xa1,&QWP($i+16*1-128,"ebp")); + &paddd ($xa2,&QWP($i+16*2-128,"ebp")); + &paddd ($xa3,&QWP($i+16*3-128,"ebp")); + + &movdqa ($xt2,$xa0); # "de-interlace" data + &punpckldq ($xa0,$xa1); + &movdqa ($xt3,$xa2); + &punpckldq ($xa2,$xa3); + &punpckhdq ($xt2,$xa1); + &punpckhdq ($xt3,$xa3); + &movdqa ($xa1,$xa0); + &punpcklqdq ($xa0,$xa2); # "a0" + &movdqa ($xa3,$xt2); + &punpcklqdq ($xt2,$xt3); # "a2" + &punpckhqdq ($xa1,$xa2); # "a1" + &punpckhqdq ($xa3,$xt3); # "a3" + + #($xa2,$xt2)=($xt2,$xa2); + + &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input + &movdqu ($xt1,&QWP(64*1-128,$inp)); + &movdqu ($xa2,&QWP(64*2-128,$inp)); + &movdqu ($xt3,&QWP(64*3-128,$inp)); + &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); + &pxor ($xt0,$xa0); + &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); + &pxor ($xt1,$xa1); + &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); + &pxor ($xt2,$xa2); + &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); + &pxor ($xt3,$xa3); + &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); + &movdqu (&QWP(64*0-128,$out),$xt0); # store output + &movdqu (&QWP(64*1-128,$out),$xt1); + &movdqu (&QWP(64*2-128,$out),$xt2); + &movdqu (&QWP(64*3-128,$out),$xt3); + &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); + } + &sub ($len,64*4); + &jnc (&label("outer_loop")); + + &add ($len,64*4); + &jz (&label("done")); + + &mov ("ebx",&DWP(512+8,"esp")); # restore pointers + &lea ($inp,&DWP(-128,$inp)); + &mov ("edx",&DWP(512+4,"esp")); + &lea ($out,&DWP(-128,$out)); + + &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value + &movdqu ("xmm3",&QWP(0,"ebx")); + &paddd ("xmm2",&QWP(16*6,"eax")); # +four + &pand ("xmm3",&QWP(16*7,"eax")); + &por ("xmm3","xmm2"); # counter value +} +{ +my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); + +sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot16); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,20); + &pslld ($t,12); + &por ($b,$t); + + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot24); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,25); + &pslld ($t,7); + &por ($b,$t); +} + +&set_label("1x"); + &movdqa ($a,&QWP(16*2,"eax")); # sigma + &movdqu ($b,&QWP(0,"edx")); + &movdqu ($c,&QWP(16,"edx")); + #&movdqu ($d,&QWP(0,"ebx")); # already loaded + &movdqa ($rot16,&QWP(0,"eax")); + &movdqa ($rot24,&QWP(16,"eax")); + &mov (&DWP(16*3,"esp"),"ebp"); + + &movdqa (&QWP(16*0,"esp"),$a); + &movdqa (&QWP(16*1,"esp"),$b); + &movdqa (&QWP(16*2,"esp"),$c); + &movdqa (&QWP(16*3,"esp"),$d); + 
&mov ("edx",10); + &jmp (&label("loop1x")); + +&set_label("outer1x",16); + &movdqa ($d,&QWP(16*5,"eax")); # one + &movdqa ($a,&QWP(16*0,"esp")); + &movdqa ($b,&QWP(16*1,"esp")); + &movdqa ($c,&QWP(16*2,"esp")); + &paddd ($d,&QWP(16*3,"esp")); + &mov ("edx",10); + &movdqa (&QWP(16*3,"esp"),$d); + &jmp (&label("loop1x")); + +&set_label("loop1x",16); + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b00111001); + &pshufd ($d,$d,0b10010011); + &nop (); + + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b10010011); + &pshufd ($d,$d,0b00111001); + + &dec ("edx"); + &jnz (&label("loop1x")); + + &paddd ($a,&QWP(16*0,"esp")); + &paddd ($b,&QWP(16*1,"esp")); + &paddd ($c,&QWP(16*2,"esp")); + &paddd ($d,&QWP(16*3,"esp")); + + &cmp ($len,64); + &jb (&label("tail")); + + &movdqu ($t,&QWP(16*0,$inp)); + &movdqu ($t1,&QWP(16*1,$inp)); + &pxor ($a,$t); # xor with input + &movdqu ($t,&QWP(16*2,$inp)); + &pxor ($b,$t1); + &movdqu ($t1,&QWP(16*3,$inp)); + &pxor ($c,$t); + &pxor ($d,$t1); + &lea ($inp,&DWP(16*4,$inp)); # inp+=64 + + &movdqu (&QWP(16*0,$out),$a); # write output + &movdqu (&QWP(16*1,$out),$b); + &movdqu (&QWP(16*2,$out),$c); + &movdqu (&QWP(16*3,$out),$d); + &lea ($out,&DWP(16*4,$out)); # inp+=64 + + &sub ($len,64); + &jnz (&label("outer1x")); + + &jmp (&label("done")); + +&set_label("tail"); + &movdqa (&QWP(16*0,"esp"),$a); + &movdqa (&QWP(16*1,"esp"),$b); + &movdqa (&QWP(16*2,"esp"),$c); + &movdqa (&QWP(16*3,"esp"),$d); + + &xor ("eax","eax"); + &xor ("edx","edx"); + &xor ("ebp","ebp"); + +&set_label("tail_loop"); + &movb ("al",&BP(0,"esp","ebp")); + &movb ("dl",&BP(0,$inp,"ebp")); + &lea ("ebp",&DWP(1,"ebp")); + &xor ("al","dl"); + &movb (&BP(-1,$out,"ebp"),"al"); + &dec ($len); + &jnz (&label("tail_loop")); +} +&set_label("done"); + &mov ("esp",&DWP(512,"esp")); +&function_end("ChaCha20_ctr32_ssse3"); + +&align (64); +&set_label("ssse3_data"); +&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd); +&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe); +&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574); +&data_word(0,1,2,3); +&data_word(4,4,4,4); +&data_word(1,0,0,0); +&data_word(4,0,0,0); +&data_word(0,-1,-1,-1); +&align (64); +} +&asciz ("ChaCha20 for x86, CRYPTOGAMS by "); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/chacha/asm/chacha-x86_64.pl b/ring-0.17.14/crypto/chacha/asm/chacha-x86_64.pl new file mode 100644 index 0000000000..002f369556 --- /dev/null +++ b/ring-0.17.14/crypto/chacha/asm/chacha-x86_64.pl @@ -0,0 +1,1854 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# November 2014 +# +# ChaCha20 for x86_64. 
+# +# December 2016 +# +# Add AVX512F code path. +# +# Performance in cycles per byte out of large buffer. +# +# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v) +# +# P4 9.48/+99% -/22.7(ii) - +# Core2 7.83/+55% 7.90/8.08 4.35 +# Westmere 7.19/+50% 5.60/6.70 3.00 +# Sandy Bridge 8.31/+42% 5.45/6.76 2.72 +# Ivy Bridge 6.71/+46% 5.40/6.49 2.41 +# Haswell 5.92/+43% 5.20/6.45 2.42 1.23 +# Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.57] +# Silvermont 12.0/+33% 7.75/7.40 7.03(iii) +# Knights L 11.7/- - 9.60(iii) 0.80 +# Goldmont 10.6/+17% 5.10/- 3.28 +# Sledgehammer 7.28/+52% -/14.2(ii) - +# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv) +# Ryzen 5.96/+50% 5.19/- 2.40 2.09 +# VIA Nano 10.5/+46% 6.72/8.60 6.05 +# +# (i) compared to older gcc 3.x one can observe >2x improvement on +# most platforms; +# (ii) as it can be seen, SSE2 performance is too low on legacy +# processors; NxSSE2 results are naturally better, but not +# impressively better than IALU ones, which is why you won't +# find SSE2 code below; +# (iii) this is not optimal result for Atom because of MSROM +# limitations, SSE2 can do better, but gain is considered too +# low to justify the [maintenance] effort; +# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20; +# +# Modified from upstream OpenSSL to remove the XOP code. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx = 2; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# input parameter block +($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); + +$code.=<<___; +.text + +.section .rodata +.align 64 +.Lzero: +.long 0,0,0,0 +.Lone: +.long 1,0,0,0 +.Linc: +.long 0,1,2,3 +.Lfour: +.long 4,4,4,4 +.Lincy: +.long 0,2,4,6,1,3,5,7 +.Leight: +.long 8,8,8,8,8,8,8,8 +.Lrot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +.Lrot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Lsigma: +.asciz "expand 32-byte k" +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.asciz "ChaCha20 for x86_64, CRYPTOGAMS by " +.text +___ + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), + "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); +@t=("%esi","%edi"); + +sub ROUND { # critical path is 24 cycles per round +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_)=map("\"$_\"",@t); +my @x=map("\"$_\"",@x); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in 
registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. + + # Normally instructions would be interleaved to favour in-order + # execution. Generally out-of-order cores manage it gracefully, + # but not this time for some reason. As in-order execution + # cores are dying breed, old Atom is the only one around, + # instructions are left uninterleaved. Besides, Atom is better + # off executing 1xSSSE3 code anyway... + + ( + "&add (@x[$a0],@x[$b0])", # Q1 + "&xor (@x[$d0],@x[$a0])", + "&rol (@x[$d0],16)", + "&add (@x[$a1],@x[$b1])", # Q2 + "&xor (@x[$d1],@x[$a1])", + "&rol (@x[$d1],16)", + + "&add ($xc,@x[$d0])", + "&xor (@x[$b0],$xc)", + "&rol (@x[$b0],12)", + "&add ($xc_,@x[$d1])", + "&xor (@x[$b1],$xc_)", + "&rol (@x[$b1],12)", + + "&add (@x[$a0],@x[$b0])", + "&xor (@x[$d0],@x[$a0])", + "&rol (@x[$d0],8)", + "&add (@x[$a1],@x[$b1])", + "&xor (@x[$d1],@x[$a1])", + "&rol (@x[$d1],8)", + + "&add ($xc,@x[$d0])", + "&xor (@x[$b0],$xc)", + "&rol (@x[$b0],7)", + "&add ($xc_,@x[$d1])", + "&xor (@x[$b1],$xc_)", + "&rol (@x[$b1],7)", + + "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's + "&mov (\"4*$c1(%rsp)\",$xc_)", + "&mov ($xc,\"4*$c2(%rsp)\")", + "&mov ($xc_,\"4*$c3(%rsp)\")", + + "&add (@x[$a2],@x[$b2])", # Q3 + "&xor (@x[$d2],@x[$a2])", + "&rol (@x[$d2],16)", + "&add (@x[$a3],@x[$b3])", # Q4 + "&xor (@x[$d3],@x[$a3])", + "&rol (@x[$d3],16)", + + "&add ($xc,@x[$d2])", + "&xor (@x[$b2],$xc)", + "&rol (@x[$b2],12)", + "&add ($xc_,@x[$d3])", + "&xor (@x[$b3],$xc_)", + "&rol (@x[$b3],12)", + + "&add (@x[$a2],@x[$b2])", + "&xor (@x[$d2],@x[$a2])", + "&rol (@x[$d2],8)", + "&add (@x[$a3],@x[$b3])", + "&xor (@x[$d3],@x[$a3])", + "&rol (@x[$d3],8)", + + "&add ($xc,@x[$d2])", + "&xor (@x[$b2],$xc)", + "&rol (@x[$b2],7)", + "&add ($xc_,@x[$d3])", + "&xor (@x[$b3],$xc_)", + "&rol (@x[$b3],7)" + ); +} + +######################################################################## +# Generic code path that handles all lengths on pre-SSSE3 processors. 
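+# The 16-word state is kept in general-purpose registers, with the 'c'
+# words shuttled through the stack as described in ROUND above; each
+# outer-loop iteration produces a single 64-byte block.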
+$code.=<<___; +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,\@function,5 +.align 64 +ChaCha20_ctr32_nohw: +.cfi_startproc + _CET_ENDBR + push %rbx +.cfi_push rbx + push %rbp +.cfi_push rbp + push %r12 +.cfi_push r12 + push %r13 +.cfi_push r13 + push %r14 +.cfi_push r14 + push %r15 +.cfi_push r15 + sub \$64+24,%rsp +.cfi_adjust_cfa_offset `64+24` +.Lctr32_body: + + #movdqa .Lsigma(%rip),%xmm0 + movdqu ($key),%xmm1 + movdqu 16($key),%xmm2 + movdqu ($counter),%xmm3 + movdqa .Lone(%rip),%xmm4 + + #movdqa %xmm0,4*0(%rsp) # key[0] + movdqa %xmm1,4*4(%rsp) # key[1] + movdqa %xmm2,4*8(%rsp) # key[2] + movdqa %xmm3,4*12(%rsp) # key[3] + mov $len,%rbp # reassign $len + jmp .Loop_outer + +.align 32 +.Loop_outer: + mov \$0x61707865,@x[0] # 'expa' + mov \$0x3320646e,@x[1] # 'nd 3' + mov \$0x79622d32,@x[2] # '2-by' + mov \$0x6b206574,@x[3] # 'te k' + mov 4*4(%rsp),@x[4] + mov 4*5(%rsp),@x[5] + mov 4*6(%rsp),@x[6] + mov 4*7(%rsp),@x[7] + movd %xmm3,@x[12] + mov 4*13(%rsp),@x[13] + mov 4*14(%rsp),@x[14] + mov 4*15(%rsp),@x[15] + + mov %rbp,64+0(%rsp) # save len + mov \$10,%ebp + mov $inp,64+8(%rsp) # save inp + movq %xmm2,%rsi # "@x[8]" + mov $out,64+16(%rsp) # save out + mov %rsi,%rdi + shr \$32,%rdi # "@x[9]" + jmp .Loop + +.align 32 +.Loop: +___ + foreach (&ROUND (0, 4, 8,12)) { eval; } + foreach (&ROUND (0, 5,10,15)) { eval; } + &dec ("%ebp"); + &jnz (".Loop"); + +$code.=<<___; + mov @t[1],4*9(%rsp) # modulo-scheduled + mov @t[0],4*8(%rsp) + mov 64(%rsp),%rbp # load len + movdqa %xmm2,%xmm1 + mov 64+8(%rsp),$inp # load inp + paddd %xmm4,%xmm3 # increment counter + mov 64+16(%rsp),$out # load out + + add \$0x61707865,@x[0] # 'expa' + add \$0x3320646e,@x[1] # 'nd 3' + add \$0x79622d32,@x[2] # '2-by' + add \$0x6b206574,@x[3] # 'te k' + add 4*4(%rsp),@x[4] + add 4*5(%rsp),@x[5] + add 4*6(%rsp),@x[6] + add 4*7(%rsp),@x[7] + add 4*12(%rsp),@x[12] + add 4*13(%rsp),@x[13] + add 4*14(%rsp),@x[14] + add 4*15(%rsp),@x[15] + paddd 4*8(%rsp),%xmm1 + + cmp \$64,%rbp + jb .Ltail + + xor 4*0($inp),@x[0] # xor with input + xor 4*1($inp),@x[1] + xor 4*2($inp),@x[2] + xor 4*3($inp),@x[3] + xor 4*4($inp),@x[4] + xor 4*5($inp),@x[5] + xor 4*6($inp),@x[6] + xor 4*7($inp),@x[7] + movdqu 4*8($inp),%xmm0 + xor 4*12($inp),@x[12] + xor 4*13($inp),@x[13] + xor 4*14($inp),@x[14] + xor 4*15($inp),@x[15] + lea 4*16($inp),$inp # inp+=64 + pxor %xmm1,%xmm0 + + movdqa %xmm2,4*8(%rsp) + movd %xmm3,4*12(%rsp) + + mov @x[0],4*0($out) # write output + mov @x[1],4*1($out) + mov @x[2],4*2($out) + mov @x[3],4*3($out) + mov @x[4],4*4($out) + mov @x[5],4*5($out) + mov @x[6],4*6($out) + mov @x[7],4*7($out) + movdqu %xmm0,4*8($out) + mov @x[12],4*12($out) + mov @x[13],4*13($out) + mov @x[14],4*14($out) + mov @x[15],4*15($out) + lea 4*16($out),$out # out+=64 + + sub \$64,%rbp + jnz .Loop_outer + + jmp .Ldone + +.align 16 +.Ltail: + mov @x[0],4*0(%rsp) + mov @x[1],4*1(%rsp) + xor %rbx,%rbx + mov @x[2],4*2(%rsp) + mov @x[3],4*3(%rsp) + mov @x[4],4*4(%rsp) + mov @x[5],4*5(%rsp) + mov @x[6],4*6(%rsp) + mov @x[7],4*7(%rsp) + movdqa %xmm1,4*8(%rsp) + mov @x[12],4*12(%rsp) + mov @x[13],4*13(%rsp) + mov @x[14],4*14(%rsp) + mov @x[15],4*15(%rsp) + +.Loop_tail: + movzb ($inp,%rbx),%eax + movzb (%rsp,%rbx),%edx + lea 1(%rbx),%rbx + xor %edx,%eax + mov %al,-1($out,%rbx) + dec %rbp + jnz .Loop_tail + +.Ldone: + lea 64+24+48(%rsp),%rsi + mov -48(%rsi),%r15 +.cfi_restore r15 + mov -40(%rsi),%r14 +.cfi_restore r14 + mov -32(%rsi),%r13 +.cfi_restore r13 + mov -24(%rsi),%r12 +.cfi_restore r12 + mov -16(%rsi),%rbp +.cfi_restore rbp + mov -8(%rsi),%rbx 
+.cfi_restore rbx + lea (%rsi),%rsp +.cfi_adjust_cfa_offset `-64-24-48` +.Lno_data: + ret +.cfi_endproc +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +___ + +######################################################################## +# SSSE3 code path that handles longer messages. +{ +# assign variables to favor Atom front-end +my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, + $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); + +sub SSSE3_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); +my @x=map("\"$_\"",@xx); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. + + ( + "&paddd (@x[$a0],@x[$b0])", # Q1 + "&paddd (@x[$a1],@x[$b1])", # Q2 + "&pxor (@x[$d0],@x[$a0])", + "&pxor (@x[$d1],@x[$a1])", + "&pshufb (@x[$d0],$t1)", + "&pshufb (@x[$d1],$t1)", + + "&paddd ($xc,@x[$d0])", + "&paddd ($xc_,@x[$d1])", + "&pxor (@x[$b0],$xc)", + "&pxor (@x[$b1],$xc_)", + "&movdqa ($t0,@x[$b0])", + "&pslld (@x[$b0],12)", + "&psrld ($t0,20)", + "&movdqa ($t1,@x[$b1])", + "&pslld (@x[$b1],12)", + "&por (@x[$b0],$t0)", + "&psrld ($t1,20)", + "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) + "&por (@x[$b1],$t1)", + + "&paddd (@x[$a0],@x[$b0])", + "&paddd (@x[$a1],@x[$b1])", + "&pxor (@x[$d0],@x[$a0])", + "&pxor (@x[$d1],@x[$a1])", + "&pshufb (@x[$d0],$t0)", + "&pshufb (@x[$d1],$t0)", + + "&paddd ($xc,@x[$d0])", + "&paddd ($xc_,@x[$d1])", + "&pxor (@x[$b0],$xc)", + "&pxor (@x[$b1],$xc_)", + "&movdqa ($t1,@x[$b0])", + "&pslld (@x[$b0],7)", + "&psrld ($t1,25)", + "&movdqa ($t0,@x[$b1])", + "&pslld (@x[$b1],7)", + "&por (@x[$b0],$t1)", + "&psrld ($t0,25)", + "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) + "&por (@x[$b1],$t0)", + + "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's + "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", + "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", + "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", + + "&paddd (@x[$a2],@x[$b2])", # Q3 + "&paddd (@x[$a3],@x[$b3])", # Q4 + "&pxor (@x[$d2],@x[$a2])", + "&pxor (@x[$d3],@x[$a3])", + "&pshufb (@x[$d2],$t1)", + "&pshufb (@x[$d3],$t1)", + + "&paddd ($xc,@x[$d2])", + "&paddd ($xc_,@x[$d3])", + "&pxor (@x[$b2],$xc)", + "&pxor (@x[$b3],$xc_)", + "&movdqa ($t0,@x[$b2])", + "&pslld (@x[$b2],12)", + "&psrld ($t0,20)", + "&movdqa ($t1,@x[$b3])", + "&pslld (@x[$b3],12)", + "&por (@x[$b2],$t0)", + "&psrld ($t1,20)", + "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) + "&por (@x[$b3],$t1)", + + "&paddd (@x[$a2],@x[$b2])", + "&paddd (@x[$a3],@x[$b3])", + "&pxor (@x[$d2],@x[$a2])", + "&pxor (@x[$d3],@x[$a3])", + "&pshufb (@x[$d2],$t0)", + "&pshufb (@x[$d3],$t0)", + + "&paddd ($xc,@x[$d2])", + "&paddd ($xc_,@x[$d3])", + "&pxor (@x[$b2],$xc)", + "&pxor 
(@x[$b3],$xc_)", + "&movdqa ($t1,@x[$b2])", + "&pslld (@x[$b2],7)", + "&psrld ($t1,25)", + "&movdqa ($t0,@x[$b3])", + "&pslld (@x[$b3],7)", + "&por (@x[$b2],$t1)", + "&psrld ($t0,25)", + "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) + "&por (@x[$b3],$t0)" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.globl ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,\@function,5 +.align 32 +ChaCha20_ctr32_ssse3_4x: +.cfi_startproc + _CET_ENDBR + mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register r9 +___ +$code.=<<___; + sub \$0x140+$xframe,%rsp +___ + ################ stack layout + # +0x00 SIMD equivalent of @x[8-12] + # ... + # +0x40 constant copy of key[0-2] smashed by lanes + # ... + # +0x100 SIMD counters (with nonce smashed by lanes) + # ... + # +0x140 +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L4x_body: +___ +$code.=<<___; + movdqa .Lsigma(%rip),$xa3 # key[0] + movdqu ($key),$xb3 # key[1] + movdqu 16($key),$xt3 # key[2] + movdqu ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + lea .Lrot16(%rip),%r10 + lea .Lrot24(%rip),%r11 + + pshufd \$0x00,$xa3,$xa0 # smash key by lanes... + pshufd \$0x55,$xa3,$xa1 + movdqa $xa0,0x40(%rsp) # ... and offload + pshufd \$0xaa,$xa3,$xa2 + movdqa $xa1,0x50(%rsp) + pshufd \$0xff,$xa3,$xa3 + movdqa $xa2,0x60(%rsp) + movdqa $xa3,0x70(%rsp) + + pshufd \$0x00,$xb3,$xb0 + pshufd \$0x55,$xb3,$xb1 + movdqa $xb0,0x80-0x100(%rcx) + pshufd \$0xaa,$xb3,$xb2 + movdqa $xb1,0x90-0x100(%rcx) + pshufd \$0xff,$xb3,$xb3 + movdqa $xb2,0xa0-0x100(%rcx) + movdqa $xb3,0xb0-0x100(%rcx) + + pshufd \$0x00,$xt3,$xt0 # "$xc0" + pshufd \$0x55,$xt3,$xt1 # "$xc1" + movdqa $xt0,0xc0-0x100(%rcx) + pshufd \$0xaa,$xt3,$xt2 # "$xc2" + movdqa $xt1,0xd0-0x100(%rcx) + pshufd \$0xff,$xt3,$xt3 # "$xc3" + movdqa $xt2,0xe0-0x100(%rcx) + movdqa $xt3,0xf0-0x100(%rcx) + + pshufd \$0x00,$xd3,$xd0 + pshufd \$0x55,$xd3,$xd1 + paddd .Linc(%rip),$xd0 # don't save counters yet + pshufd \$0xaa,$xd3,$xd2 + movdqa $xd1,0x110-0x100(%rcx) + pshufd \$0xff,$xd3,$xd3 + movdqa $xd2,0x120-0x100(%rcx) + movdqa $xd3,0x130-0x100(%rcx) + + jmp .Loop_enter4x + +.align 32 +.Loop_outer4x: + movdqa 0x40(%rsp),$xa0 # re-load smashed key + movdqa 0x50(%rsp),$xa1 + movdqa 0x60(%rsp),$xa2 + movdqa 0x70(%rsp),$xa3 + movdqa 0x80-0x100(%rcx),$xb0 + movdqa 0x90-0x100(%rcx),$xb1 + movdqa 0xa0-0x100(%rcx),$xb2 + movdqa 0xb0-0x100(%rcx),$xb3 + movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" + movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" + movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" + movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" + movdqa 0x100-0x100(%rcx),$xd0 + movdqa 0x110-0x100(%rcx),$xd1 + movdqa 0x120-0x100(%rcx),$xd2 + movdqa 0x130-0x100(%rcx),$xd3 + paddd .Lfour(%rip),$xd0 # next SIMD counters + +.Loop_enter4x: + movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" + movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" + movdqa (%r10),$xt3 # .Lrot16(%rip) + mov \$10,%eax + movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters + jmp .Loop4x + +.align 32 +.Loop4x: +___ + foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop4x + + paddd 0x40(%rsp),$xa0 # accumulate key material + paddd 0x50(%rsp),$xa1 + paddd 0x60(%rsp),$xa2 + paddd 0x70(%rsp),$xa3 + + movdqa $xa0,$xt2 # "de-interlace" data + punpckldq 
$xa1,$xa0 + movdqa $xa2,$xt3 + punpckldq $xa3,$xa2 + punpckhdq $xa1,$xt2 + punpckhdq $xa3,$xt3 + movdqa $xa0,$xa1 + punpcklqdq $xa2,$xa0 # "a0" + movdqa $xt2,$xa3 + punpcklqdq $xt3,$xt2 # "a2" + punpckhqdq $xa2,$xa1 # "a1" + punpckhqdq $xt3,$xa3 # "a3" +___ + ($xa2,$xt2)=($xt2,$xa2); +$code.=<<___; + paddd 0x80-0x100(%rcx),$xb0 + paddd 0x90-0x100(%rcx),$xb1 + paddd 0xa0-0x100(%rcx),$xb2 + paddd 0xb0-0x100(%rcx),$xb3 + + movdqa $xa0,0x00(%rsp) # offload $xaN + movdqa $xa1,0x10(%rsp) + movdqa 0x20(%rsp),$xa0 # "xc2" + movdqa 0x30(%rsp),$xa1 # "xc3" + + movdqa $xb0,$xt2 + punpckldq $xb1,$xb0 + movdqa $xb2,$xt3 + punpckldq $xb3,$xb2 + punpckhdq $xb1,$xt2 + punpckhdq $xb3,$xt3 + movdqa $xb0,$xb1 + punpcklqdq $xb2,$xb0 # "b0" + movdqa $xt2,$xb3 + punpcklqdq $xt3,$xt2 # "b2" + punpckhqdq $xb2,$xb1 # "b1" + punpckhqdq $xt3,$xb3 # "b3" +___ + ($xb2,$xt2)=($xt2,$xb2); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + paddd 0xc0-0x100(%rcx),$xc0 + paddd 0xd0-0x100(%rcx),$xc1 + paddd 0xe0-0x100(%rcx),$xc2 + paddd 0xf0-0x100(%rcx),$xc3 + + movdqa $xa2,0x20(%rsp) # keep offloading $xaN + movdqa $xa3,0x30(%rsp) + + movdqa $xc0,$xt2 + punpckldq $xc1,$xc0 + movdqa $xc2,$xt3 + punpckldq $xc3,$xc2 + punpckhdq $xc1,$xt2 + punpckhdq $xc3,$xt3 + movdqa $xc0,$xc1 + punpcklqdq $xc2,$xc0 # "c0" + movdqa $xt2,$xc3 + punpcklqdq $xt3,$xt2 # "c2" + punpckhqdq $xc2,$xc1 # "c1" + punpckhqdq $xt3,$xc3 # "c3" +___ + ($xc2,$xt2)=($xt2,$xc2); + ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary +$code.=<<___; + paddd 0x100-0x100(%rcx),$xd0 + paddd 0x110-0x100(%rcx),$xd1 + paddd 0x120-0x100(%rcx),$xd2 + paddd 0x130-0x100(%rcx),$xd3 + + movdqa $xd0,$xt2 + punpckldq $xd1,$xd0 + movdqa $xd2,$xt3 + punpckldq $xd3,$xd2 + punpckhdq $xd1,$xt2 + punpckhdq $xd3,$xt3 + movdqa $xd0,$xd1 + punpcklqdq $xd2,$xd0 # "d0" + movdqa $xt2,$xd3 + punpcklqdq $xt3,$xt2 # "d2" + punpckhqdq $xd2,$xd1 # "d1" + punpckhqdq $xt3,$xd3 # "d3" +___ + ($xd2,$xt2)=($xt2,$xd2); +$code.=<<___; + cmp \$64*4,$len + jb .Ltail4x + + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # size optimization + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + + movdqu $xt0,0x40($out) + movdqu 0x00($inp),$xt0 + movdqu $xt1,0x50($out) + movdqu 0x10($inp),$xt1 + movdqu $xt2,0x60($out) + movdqu 0x20($inp),$xt2 + movdqu $xt3,0x70($out) + lea 0x80($out),$out # size optimization + movdqu 0x30($inp),$xt3 + pxor 0x20(%rsp),$xt0 + pxor $xb2,$xt1 + pxor $xc2,$xt2 + pxor $xd2,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # inp+=64*4 + pxor 0x30(%rsp),$xt0 + pxor $xb3,$xt1 + pxor $xc3,$xt2 + pxor $xd3,$xt3 + movdqu $xt0,0x40($out) + movdqu $xt1,0x50($out) + movdqu $xt2,0x60($out) + movdqu $xt3,0x70($out) + lea 0x80($out),$out # out+=64*4 + + sub \$64*4,$len + jnz .Loop_outer4x + + jmp .Ldone4x + +.Ltail4x: + cmp \$192,$len + jae .L192_or_more4x + cmp \$128,$len + jae .L128_or_more4x + cmp \$64,$len + jae .L64_or_more4x + + #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 
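+	# Fewer than 64 bytes remain: the first 16 bytes of keystream block 0
+	# are already at 0(%rsp) from the offload above, so store the other
+	# three quarters next to them and let .Loop_tail4x below xor the
+	# stack copy with the input byte by byte.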
+ xor %r10,%r10 + #movdqa $xt0,0x00(%rsp) + movdqa $xb0,0x10(%rsp) + movdqa $xc0,0x20(%rsp) + movdqa $xd0,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L64_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + movdqu $xt0,0x00($out) + movdqu $xt1,0x10($out) + movdqu $xt2,0x20($out) + movdqu $xt3,0x30($out) + je .Ldone4x + + movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x40($inp),$inp # inp+=64*1 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb1,0x10(%rsp) + lea 0x40($out),$out # out+=64*1 + movdqa $xc1,0x20(%rsp) + sub \$64,$len # len-=64*1 + movdqa $xd1,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L128_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + movdqu $xt0,0x40($out) + movdqu $xt1,0x50($out) + movdqu $xt2,0x60($out) + movdqu $xt3,0x70($out) + je .Ldone4x + + movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x80($inp),$inp # inp+=64*2 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb2,0x10(%rsp) + lea 0x80($out),$out # out+=64*2 + movdqa $xc2,0x20(%rsp) + sub \$128,$len # len-=64*2 + movdqa $xd2,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L192_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # size optimization + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + + movdqu $xt0,0x40($out) + movdqu 0x00($inp),$xt0 + movdqu $xt1,0x50($out) + movdqu 0x10($inp),$xt1 + movdqu $xt2,0x60($out) + movdqu 0x20($inp),$xt2 + movdqu $xt3,0x70($out) + lea 0x80($out),$out # size optimization + movdqu 0x30($inp),$xt3 + pxor 0x20(%rsp),$xt0 + pxor $xb2,$xt1 + pxor $xc2,$xt2 + pxor $xd2,$xt3 + movdqu $xt0,0x00($out) + movdqu $xt1,0x10($out) + movdqu $xt2,0x20($out) + movdqu $xt3,0x30($out) + je .Ldone4x + + movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? 
+ lea 0x40($inp),$inp # inp+=64*3 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb3,0x10(%rsp) + lea 0x40($out),$out # out+=64*3 + movdqa $xc3,0x20(%rsp) + sub \$192,$len # len-=64*3 + movdqa $xd3,0x30(%rsp) + +.Loop_tail4x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail4x + +.Ldone4x: +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register rsp +.L4x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x +___ +} + +######################################################################## +# AVX2 code path +if ($avx>1) { +my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, + $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); + +sub AVX2_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); +my @x=map("\"$_\"",@xx); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. 
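+	# Same quarter-round schedule as the SSSE3 4x path, but on 256-bit ymm
+	# registers: each register holds one state word for eight independent
+	# blocks, so the 8x outer loop emits 512 bytes of keystream per
+	# iteration.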
+ + ( + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpshufb (@x[$d0],@x[$d0],$t1)", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpshufb (@x[$d1],@x[$d1],$t1)", + + "&vpaddd ($xc,$xc,@x[$d0])", + "&vpxor (@x[$b0],$xc,@x[$b0])", + "&vpslld ($t0,@x[$b0],12)", + "&vpsrld (@x[$b0],@x[$b0],20)", + "&vpor (@x[$b0],$t0,@x[$b0])", + "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) + "&vpaddd ($xc_,$xc_,@x[$d1])", + "&vpxor (@x[$b1],$xc_,@x[$b1])", + "&vpslld ($t1,@x[$b1],12)", + "&vpsrld (@x[$b1],@x[$b1],20)", + "&vpor (@x[$b1],$t1,@x[$b1])", + + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpshufb (@x[$d0],@x[$d0],$t0)", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpshufb (@x[$d1],@x[$d1],$t0)", + + "&vpaddd ($xc,$xc,@x[$d0])", + "&vpxor (@x[$b0],$xc,@x[$b0])", + "&vpslld ($t1,@x[$b0],7)", + "&vpsrld (@x[$b0],@x[$b0],25)", + "&vpor (@x[$b0],$t1,@x[$b0])", + "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) + "&vpaddd ($xc_,$xc_,@x[$d1])", + "&vpxor (@x[$b1],$xc_,@x[$b1])", + "&vpslld ($t0,@x[$b1],7)", + "&vpsrld (@x[$b1],@x[$b1],25)", + "&vpor (@x[$b1],$t0,@x[$b1])", + + "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's + "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", + "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", + "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", + + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpshufb (@x[$d2],@x[$d2],$t1)", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vpshufb (@x[$d3],@x[$d3],$t1)", + + "&vpaddd ($xc,$xc,@x[$d2])", + "&vpxor (@x[$b2],$xc,@x[$b2])", + "&vpslld ($t0,@x[$b2],12)", + "&vpsrld (@x[$b2],@x[$b2],20)", + "&vpor (@x[$b2],$t0,@x[$b2])", + "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) + "&vpaddd ($xc_,$xc_,@x[$d3])", + "&vpxor (@x[$b3],$xc_,@x[$b3])", + "&vpslld ($t1,@x[$b3],12)", + "&vpsrld (@x[$b3],@x[$b3],20)", + "&vpor (@x[$b3],$t1,@x[$b3])", + + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpshufb (@x[$d2],@x[$d2],$t0)", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vpshufb (@x[$d3],@x[$d3],$t0)", + + "&vpaddd ($xc,$xc,@x[$d2])", + "&vpxor (@x[$b2],$xc,@x[$b2])", + "&vpslld ($t1,@x[$b2],7)", + "&vpsrld (@x[$b2],@x[$b2],25)", + "&vpor (@x[$b2],$t1,@x[$b2])", + "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) + "&vpaddd ($xc_,$xc_,@x[$d3])", + "&vpxor (@x[$b3],$xc_,@x[$b3])", + "&vpslld ($t0,@x[$b3],7)", + "&vpsrld (@x[$b3],@x[$b3],25)", + "&vpor (@x[$b3],$t0,@x[$b3])" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.globl ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,\@function,5 +.align 32 +ChaCha20_ctr32_avx2: +.cfi_startproc + _CET_ENDBR + mov %rsp,%r9 # frame register +.cfi_def_cfa_register r9 + sub \$0x280+$xframe,%rsp + and \$-32,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L8x_body: +___ +$code.=<<___; + vzeroupper + + ################ stack layout + # +0x00 SIMD equivalent of @x[8-12] + # ... + # +0x80 constant copy of key[0-2] smashed by lanes + # ... + # +0x200 SIMD counters (with nonce smashed by lanes) + # ... 
+ # +0x280 + + vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] + vbroadcasti128 ($key),$xb3 # key[1] + vbroadcasti128 16($key),$xt3 # key[2] + vbroadcasti128 ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + lea 0x200(%rsp),%rax # size optimization + lea .Lrot16(%rip),%r10 + lea .Lrot24(%rip),%r11 + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... + vpshufd \$0x55,$xa3,$xa1 + vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload + vpshufd \$0xaa,$xa3,$xa2 + vmovdqa $xa1,0xa0-0x100(%rcx) + vpshufd \$0xff,$xa3,$xa3 + vmovdqa $xa2,0xc0-0x100(%rcx) + vmovdqa $xa3,0xe0-0x100(%rcx) + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vmovdqa $xb0,0x100-0x100(%rcx) + vpshufd \$0xaa,$xb3,$xb2 + vmovdqa $xb1,0x120-0x100(%rcx) + vpshufd \$0xff,$xb3,$xb3 + vmovdqa $xb2,0x140-0x100(%rcx) + vmovdqa $xb3,0x160-0x100(%rcx) + + vpshufd \$0x00,$xt3,$xt0 # "xc0" + vpshufd \$0x55,$xt3,$xt1 # "xc1" + vmovdqa $xt0,0x180-0x200(%rax) + vpshufd \$0xaa,$xt3,$xt2 # "xc2" + vmovdqa $xt1,0x1a0-0x200(%rax) + vpshufd \$0xff,$xt3,$xt3 # "xc3" + vmovdqa $xt2,0x1c0-0x200(%rax) + vmovdqa $xt3,0x1e0-0x200(%rax) + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet + vpshufd \$0xaa,$xd3,$xd2 + vmovdqa $xd1,0x220-0x200(%rax) + vpshufd \$0xff,$xd3,$xd3 + vmovdqa $xd2,0x240-0x200(%rax) + vmovdqa $xd3,0x260-0x200(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key + vmovdqa 0xa0-0x100(%rcx),$xa1 + vmovdqa 0xc0-0x100(%rcx),$xa2 + vmovdqa 0xe0-0x100(%rcx),$xa3 + vmovdqa 0x100-0x100(%rcx),$xb0 + vmovdqa 0x120-0x100(%rcx),$xb1 + vmovdqa 0x140-0x100(%rcx),$xb2 + vmovdqa 0x160-0x100(%rcx),$xb3 + vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" + vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" + vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" + vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" + vmovdqa 0x200-0x200(%rax),$xd0 + vmovdqa 0x220-0x200(%rax),$xd1 + vmovdqa 0x240-0x200(%rax),$xd2 + vmovdqa 0x260-0x200(%rax),$xd3 + vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters + +.Loop_enter8x: + vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" + vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" + vbroadcasti128 (%r10),$xt3 + vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters + mov \$10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: +___ + foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop8x + + lea 0x200(%rsp),%rax # size optimization + vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key + vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 + vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 + vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq $xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd 0x100-0x100(%rcx),$xb0,$xb0 + vpaddd 0x120-0x100(%rcx),$xb1,$xb1 + vpaddd 0x140-0x100(%rcx),$xb2,$xb2 + vpaddd 0x160-0x100(%rcx),$xb3,$xb3 + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); +$code.=<<___; + vperm2i128 \$0x20,$xb0,$xa0,$xt3 
# "de-interlace" further + vperm2i128 \$0x31,$xb0,$xa0,$xb0 + vperm2i128 \$0x20,$xb1,$xa1,$xa0 + vperm2i128 \$0x31,$xb1,$xa1,$xb1 + vperm2i128 \$0x20,$xb2,$xa2,$xa1 + vperm2i128 \$0x31,$xb2,$xa2,$xb2 + vperm2i128 \$0x20,$xb3,$xa3,$xa2 + vperm2i128 \$0x31,$xb3,$xa3,$xb3 +___ + ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + vmovdqa $xa0,0x00(%rsp) # offload $xaN + vmovdqa $xa1,0x20(%rsp) + vmovdqa 0x40(%rsp),$xc2 # $xa0 + vmovdqa 0x60(%rsp),$xc3 # $xa1 + + vpaddd 0x180-0x200(%rax),$xc0,$xc0 + vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 + vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 + vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd 0x200-0x200(%rax),$xd0,$xd0 + vpaddd 0x220-0x200(%rax),$xd1,$xd1 + vpaddd 0x240-0x200(%rax),$xd2,$xd2 + vpaddd 0x260-0x200(%rax),$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # "d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); +$code.=<<___; + vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further + vperm2i128 \$0x31,$xd0,$xc0,$xd0 + vperm2i128 \$0x20,$xd1,$xc1,$xc0 + vperm2i128 \$0x31,$xd1,$xc1,$xd1 + vperm2i128 \$0x20,$xd2,$xc2,$xc1 + vperm2i128 \$0x31,$xd2,$xc2,$xd2 + vperm2i128 \$0x20,$xd3,$xc3,$xc2 + vperm2i128 \$0x31,$xd3,$xc3,$xd3 +___ + ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); + ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= + ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); + ($xa0,$xa1)=($xt2,$xt3); +$code.=<<___; + vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
+ vmovdqa 0x20(%rsp),$xa1 + + cmp \$64*8,$len + jb .Ltail8x + + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa1,$xa1 + vpxor 0x20($inp),$xb1,$xb1 + vpxor 0x40($inp),$xc1,$xc1 + vpxor 0x60($inp),$xd1,$xd1 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa1,0x00($out) + vmovdqu $xb1,0x20($out) + vmovdqu $xc1,0x40($out) + vmovdqu $xd1,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa2,$xa2 + vpxor 0x20($inp),$xb2,$xb2 + vpxor 0x40($inp),$xc2,$xc2 + vpxor 0x60($inp),$xd2,$xd2 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa2,0x00($out) + vmovdqu $xb2,0x20($out) + vmovdqu $xc2,0x40($out) + vmovdqu $xd2,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa3,$xa3 + vpxor 0x20($inp),$xb3,$xb3 + vpxor 0x40($inp),$xc3,$xc3 + vpxor 0x60($inp),$xd3,$xd3 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa3,0x00($out) + vmovdqu $xb3,0x20($out) + vmovdqu $xc3,0x40($out) + vmovdqu $xd3,0x60($out) + lea 0x80($out),$out # size optimization + + sub \$64*8,$len + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmp \$448,$len + jae .L448_or_more8x + cmp \$384,$len + jae .L384_or_more8x + cmp \$320,$len + jae .L320_or_more8x + cmp \$256,$len + jae .L256_or_more8x + cmp \$192,$len + jae .L192_or_more8x + cmp \$128,$len + jae .L128_or_more8x + cmp \$64,$len + jae .L64_or_more8x + + xor %r10,%r10 + vmovdqa $xa0,0x00(%rsp) + vmovdqa $xb0,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + je .Ldone8x + + lea 0x40($inp),$inp # inp+=64*1 + xor %r10,%r10 + vmovdqa $xc0,0x00(%rsp) + lea 0x40($out),$out # out+=64*1 + sub \$64,$len # len-=64*1 + vmovdqa $xd0,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + je .Ldone8x + + lea 0x80($inp),$inp # inp+=64*2 + xor %r10,%r10 + vmovdqa $xa1,0x00(%rsp) + lea 0x80($out),$out # out+=64*2 + sub \$128,$len # len-=64*2 + vmovdqa $xb1,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + je .Ldone8x + + lea 0xc0($inp),$inp # inp+=64*3 + xor %r10,%r10 + vmovdqa $xc1,0x00(%rsp) + lea 0xc0($out),$out # out+=64*3 + sub \$192,$len # len-=64*3 + vmovdqa $xd1,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu 
$xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + je .Ldone8x + + lea 0x100($inp),$inp # inp+=64*4 + xor %r10,%r10 + vmovdqa $xa2,0x00(%rsp) + lea 0x100($out),$out # out+=64*4 + sub \$256,$len # len-=64*4 + vmovdqa $xb2,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + je .Ldone8x + + lea 0x140($inp),$inp # inp+=64*5 + xor %r10,%r10 + vmovdqa $xc2,0x00(%rsp) + lea 0x140($out),$out # out+=64*5 + sub \$320,$len # len-=64*5 + vmovdqa $xd2,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vpxor 0x140($inp),$xc2,$xc2 + vpxor 0x160($inp),$xd2,$xd2 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + vmovdqu $xc2,0x140($out) + vmovdqu $xd2,0x160($out) + je .Ldone8x + + lea 0x180($inp),$inp # inp+=64*6 + xor %r10,%r10 + vmovdqa $xa3,0x00(%rsp) + lea 0x180($out),$out # out+=64*6 + sub \$384,$len # len-=64*6 + vmovdqa $xb3,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vpxor 0x140($inp),$xc2,$xc2 + vpxor 0x160($inp),$xd2,$xd2 + vpxor 0x180($inp),$xa3,$xa3 + vpxor 0x1a0($inp),$xb3,$xb3 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + vmovdqu $xc2,0x140($out) + vmovdqu $xd2,0x160($out) + vmovdqu $xa3,0x180($out) + vmovdqu $xb3,0x1a0($out) + je .Ldone8x + + lea 0x1c0($inp),$inp # inp+=64*7 + xor %r10,%r10 + vmovdqa $xc3,0x00(%rsp) + lea 0x1c0($out),$out # out+=64*7 + sub \$448,$len # len-=64*7 + vmovdqa $xd3,0x20(%rsp) + +.Loop_tail8x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail8x + +.Ldone8x: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register rsp 
+.L8x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 +___ +} + +######################################################################## +# AVX512 code paths were removed + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + lea .Lctr32_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lno_data(%rip),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 64+24+48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipR9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0x28(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$4,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size ssse3_handler,.-ssse3_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax 
+ mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipR9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0xa8(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ChaCha20_ctr32_nohw + .rva .LSEH_end_ChaCha20_ctr32_nohw + .rva .LSEH_info_ChaCha20_ctr32_nohw + + .rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_end_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_info_ChaCha20_ctr32_ssse3_4x +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ChaCha20_ctr32_avx2 + .rva .LSEH_end_ChaCha20_ctr32_avx2 + .rva .LSEH_info_ChaCha20_ctr32_avx2 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_ChaCha20_ctr32_nohw: + .byte 9,0,0,0 + .rva se_handler + +.LSEH_info_ChaCha20_ctr32_ssse3_4x: + .byte 9,0,0,0 + .rva full_handler + .rva .L4x_body,.L4x_epilogue +___ +$code.=<<___ if ($avx>1); +.LSEH_info_ChaCha20_ctr32_avx2: + .byte 9,0,0,0 + .rva full_handler + .rva .L8x_body,.L8x_epilogue # HandlerData[] +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/%x#%[yz]/%x/g; # "down-shift" + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_armv8.pl b/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_armv8.pl new file mode 100644 index 0000000000..06a35e067e --- /dev/null +++ b/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_armv8.pl @@ -0,0 +1,1643 @@ +#!/usr/bin/env perl + +# Copyright (c) 2020, CloudFlare Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +############################################################################## +# # +# Author: Vlad Krasnov # +# # +############################################################################## + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my ($oup,$inp,$inl,$adp,$adl,$keyp,$itr1,$itr2) = ("x0","x1","x2","x3","x4","x5","x6","x7"); +my ($acc0,$acc1,$acc2) = map("x$_",(8..10)); +my ($t0,$t1,$t2,$t3) = map("x$_",(11..14)); +my ($one, $r0, $r1) = ("x15","x16","x17"); +my ($t0w) = $t0 =~ s/x/w/r; + +my ($A0,$A1,$A2,$A3,$A4,$B0,$B1,$B2,$B3,$B4,$C0,$C1,$C2,$C3,$C4,$D0,$D1,$D2,$D3,$D4) = map("v$_",(0..19)); +my ($T0,$T1,$T2,$T3) = map("v$_",(20..23)); + +my $CONSTS = "v24"; +my $INC = "v25"; +my $ROL8 = "v26"; +my $CLAMP = "v27"; + +my ($B_STORE, $C_STORE, $D_STORE) = map("v$_",(28..30)); + +my $S_STORE = $CLAMP; +my $LEN_STORE = "v31"; + +sub chacha_qr { +my ($a,$b,$c,$d,$t,$dir)=@_; +my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4"); +$code.=<<___; + add $a.4s, $a.4s, $b.4s + eor $d.16b, $d.16b, $a.16b + rev32 $d.8h, $d.8h + + add $c.4s, $c.4s, $d.4s + eor $b.16b, $b.16b, $c.16b + ushr $t.4s, $b.4s, #20 + sli $t.4s, $b.4s, #12 +___ + ($t,$b) = ($b,$t); +$code.=<<___; + add $a.4s, $a.4s, $b.4s + eor $d.16b, $d.16b, $a.16b + tbl $d.16b, {$d.16b}, $ROL8.16b + + add $c.4s, $c.4s, $d.4s + eor $b.16b, $b.16b, $c.16b + ushr $t.4s, $b.4s, #25 + sli $t.4s, $b.4s, #7 +___ + ($t,$b) = ($b,$t); +$code.=<<___; + ext $b.16b, $b.16b, $b.16b, $shift_b + ext $c.16b, $c.16b, $c.16b, #8 + ext $d.16b, $d.16b, $d.16b, $shift_d +___ +} + +sub poly_add { +my ($src)=@_; +$code.="ldp $t0, $t1, [$src], 16 + adds $acc0, $acc0, $t0 + adcs $acc1, $acc1, $t1 + adc $acc2, $acc2, $one\n"; +} + +sub poly_add_vec { +my ($src)=@_; +$code.="mov $t0, $src.d[0] + mov $t1, $src.d[1] + adds $acc0, $acc0, $t0 + adcs $acc1, $acc1, $t1 + adc $acc2, $acc2, $one\n"; +} + +sub poly_stage1 { +$code.="mul $t0, $acc0, $r0 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh $t1, $acc0, $r0 + mul $t2, $acc1, $r0 + umulh $t3, $acc1, $r0 + adds $t1, $t1, $t2 + mul $t2, $acc2, $r0 + adc $t2, $t2, $t3\n"; +} + +sub poly_stage2 { +$code.="mul $t3, $acc0, $r1 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh $acc0, $acc0, $r1 + adds $t1, $t1, $t3 + mul $t3, $acc1, $r1 + umulh $acc1, $acc1, $r1 + adcs $t3, $t3, $acc0 + mul $acc2, $acc2, $r1 + adc $acc2, $acc2, $acc1 + adds $t2, $t2, $t3 + adc $t3, $acc2, xzr\n"; +} + +# At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of +# r = [r1:r0] and acc = [acc2:acc1:acc0] +# r is 124 bits at most (due to clamping) and acc is 131 bits at most +# (acc2 is at most 4 before the addition and can be at most 6 when we add in +# the next block) therefore t is at most 255 bits big, and t3 is 63 bits. 
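+#
+# The fold relies on 2^130 = 5 (mod p) for p = 2^130 - 5: writing
+# t = lo + 2^130*hi, with lo = t mod 2^130 and hi = t >> 130, gives
+# t = lo + 5*hi = lo + 4*hi + hi (mod p), which is what the limb arithmetic
+# in poly_reduce_stage computes. As an illustrative sketch only (this
+# reference helper is ours, is never called by the generator, and only
+# assumes core Perl's Math::BigInt), the same fold on scalars is:
+use Math::BigInt;
+sub poly_reduce_ref {                             # documentation only
+    my ($t) = @_;                                 # t = r * acc, below 2^255
+    my $two130 = Math::BigInt->new(1)->blsft(130);
+    my $lo = $t->copy->bmod($two130);             # low 130 bits of t
+    my $hi = $t->copy->brsft(130);                # t >> 130
+    # congruent to t mod p but not fully reduced; the code keeps acc in this
+    # form and only does a final conditional subtraction of p at the very end
+    return $lo->badd($hi->copy->bmul(4))->badd($hi);
+}
+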
+sub poly_reduce_stage { +$code.="and $acc2, $t2, #3 // At this point acc2 is 2 bits at most (value of 3) + and $acc0, $t2, #-4 + extr $t2, $t3, $t2, #2 + adds $acc0, $acc0, $t0 + lsr $t0, $t3, #2 + adc $acc1, $t3, $t0 // No carry out since t0 is 61 bits and t3 is 63 bits + adds $acc0, $acc0, $t2 + adcs $acc1, $acc1, $t1 + adc $acc2, $acc2, xzr // At this point acc2 has the value of 4 at most \n"; +} + +sub poly_mul { + &poly_stage1(); + &poly_stage2(); + &poly_reduce_stage(); +} + +sub chacha_qr_x3 { +my ($dir)=@_; +my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4"); +$code.=<<___; + add $A0.4s, $A0.4s, $B0.4s + add $A1.4s, $A1.4s, $B1.4s + add $A2.4s, $A2.4s, $B2.4s + eor $D0.16b, $D0.16b, $A0.16b + eor $D1.16b, $D1.16b, $A1.16b + eor $D2.16b, $D2.16b, $A2.16b + rev32 $D0.8h, $D0.8h + rev32 $D1.8h, $D1.8h + rev32 $D2.8h, $D2.8h + + add $C0.4s, $C0.4s, $D0.4s + add $C1.4s, $C1.4s, $D1.4s + add $C2.4s, $C2.4s, $D2.4s + eor $B0.16b, $B0.16b, $C0.16b + eor $B1.16b, $B1.16b, $C1.16b + eor $B2.16b, $B2.16b, $C2.16b + ushr $T0.4s, $B0.4s, #20 + sli $T0.4s, $B0.4s, #12 + ushr $B0.4s, $B1.4s, #20 + sli $B0.4s, $B1.4s, #12 + ushr $B1.4s, $B2.4s, #20 + sli $B1.4s, $B2.4s, #12 + + add $A0.4s, $A0.4s, $T0.4s + add $A1.4s, $A1.4s, $B0.4s + add $A2.4s, $A2.4s, $B1.4s + eor $D0.16b, $D0.16b, $A0.16b + eor $D1.16b, $D1.16b, $A1.16b + eor $D2.16b, $D2.16b, $A2.16b + tbl $D0.16b, {$D0.16b}, $ROL8.16b + tbl $D1.16b, {$D1.16b}, $ROL8.16b + tbl $D2.16b, {$D2.16b}, $ROL8.16b + + add $C0.4s, $C0.4s, $D0.4s + add $C1.4s, $C1.4s, $D1.4s + add $C2.4s, $C2.4s, $D2.4s + eor $T0.16b, $T0.16b, $C0.16b + eor $B0.16b, $B0.16b, $C1.16b + eor $B1.16b, $B1.16b, $C2.16b + ushr $B2.4s, $B1.4s, #25 + sli $B2.4s, $B1.4s, #7 + ushr $B1.4s, $B0.4s, #25 + sli $B1.4s, $B0.4s, #7 + ushr $B0.4s, $T0.4s, #25 + sli $B0.4s, $T0.4s, #7 + + ext $B0.16b, $B0.16b, $B0.16b, $shift_b + ext $B1.16b, $B1.16b, $B1.16b, $shift_b + ext $B2.16b, $B2.16b, $B2.16b, $shift_b + + ext $C0.16b, $C0.16b, $C0.16b, #8 + ext $C1.16b, $C1.16b, $C1.16b, #8 + ext $C2.16b, $C2.16b, $C2.16b, #8 + + ext $D0.16b, $D0.16b, $D0.16b, $shift_d + ext $D1.16b, $D1.16b, $D1.16b, $shift_d + ext $D2.16b, $D2.16b, $D2.16b, $shift_d +___ +} + +# When preparing 5 ChaCha20 blocks in parallel, we operate on 4 blocks vertically as introduced by Andrew Moon +# the fifth block is done horizontally +sub chacha_qr_x5 { +my ($dir)=@_; +my ($a0,$a1,$a2,$a3) = $dir =~ /left/ ? ($A0,$A1,$A2,$A3) : ($A0,$A1,$A2,$A3); +my ($b0,$b1,$b2,$b3) = $dir =~ /left/ ? ($B0,$B1,$B2,$B3) : ($B1,$B2,$B3,$B0); +my ($c0,$c1,$c2,$c3) = $dir =~ /left/ ? ($C0,$C1,$C2,$C3) : ($C2,$C3,$C0,$C1); +my ($d0,$d1,$d2,$d3) = $dir =~ /left/ ? ($D0,$D1,$D2,$D3) : ($D3,$D0,$D1,$D2); +my ($shift_b,$shift_d) = $dir =~ /left/ ? 
("#4","#12") : ("#12","#4"); +$code.=<<___; + add $a0.4s, $a0.4s, $b0.4s + add $a1.4s, $a1.4s, $b1.4s + add $a2.4s, $a2.4s, $b2.4s + add $a3.4s, $a3.4s, $b3.4s + add $A4.4s, $A4.4s, $B4.4s + + eor $d0.16b, $d0.16b, $a0.16b + eor $d1.16b, $d1.16b, $a1.16b + eor $d2.16b, $d2.16b, $a2.16b + eor $d3.16b, $d3.16b, $a3.16b + eor $D4.16b, $D4.16b, $A4.16b + + rev32 $d0.8h, $d0.8h + rev32 $d1.8h, $d1.8h + rev32 $d2.8h, $d2.8h + rev32 $d3.8h, $d3.8h + rev32 $D4.8h, $D4.8h + + add $c0.4s, $c0.4s, $d0.4s + add $c1.4s, $c1.4s, $d1.4s + add $c2.4s, $c2.4s, $d2.4s + add $c3.4s, $c3.4s, $d3.4s + add $C4.4s, $C4.4s, $D4.4s + + eor $b0.16b, $b0.16b, $c0.16b + eor $b1.16b, $b1.16b, $c1.16b + eor $b2.16b, $b2.16b, $c2.16b + eor $b3.16b, $b3.16b, $c3.16b + eor $B4.16b, $B4.16b, $C4.16b + + ushr $T0.4s, $b0.4s, #20 + sli $T0.4s, $b0.4s, #12 + ushr $b0.4s, $b1.4s, #20 + sli $b0.4s, $b1.4s, #12 + ushr $b1.4s, $b2.4s, #20 + sli $b1.4s, $b2.4s, #12 + ushr $b2.4s, $b3.4s, #20 + sli $b2.4s, $b3.4s, #12 + ushr $b3.4s, $B4.4s, #20 + sli $b3.4s, $B4.4s, #12 + + add $a0.4s, $a0.4s, $T0.4s + add $a1.4s, $a1.4s, $b0.4s + add $a2.4s, $a2.4s, $b1.4s + add $a3.4s, $a3.4s, $b2.4s + add $A4.4s, $A4.4s, $b3.4s + + eor $d0.16b, $d0.16b, $a0.16b + eor $d1.16b, $d1.16b, $a1.16b + eor $d2.16b, $d2.16b, $a2.16b + eor $d3.16b, $d3.16b, $a3.16b + eor $D4.16b, $D4.16b, $A4.16b + + tbl $d0.16b, {$d0.16b}, $ROL8.16b + tbl $d1.16b, {$d1.16b}, $ROL8.16b + tbl $d2.16b, {$d2.16b}, $ROL8.16b + tbl $d3.16b, {$d3.16b}, $ROL8.16b + tbl $D4.16b, {$D4.16b}, $ROL8.16b + + add $c0.4s, $c0.4s, $d0.4s + add $c1.4s, $c1.4s, $d1.4s + add $c2.4s, $c2.4s, $d2.4s + add $c3.4s, $c3.4s, $d3.4s + add $C4.4s, $C4.4s, $D4.4s + + eor $T0.16b, $T0.16b, $c0.16b + eor $b0.16b, $b0.16b, $c1.16b + eor $b1.16b, $b1.16b, $c2.16b + eor $b2.16b, $b2.16b, $c3.16b + eor $b3.16b, $b3.16b, $C4.16b + + ushr $B4.4s, $b3.4s, #25 + sli $B4.4s, $b3.4s, #7 + ushr $b3.4s, $b2.4s, #25 + sli $b3.4s, $b2.4s, #7 + ushr $b2.4s, $b1.4s, #25 + sli $b2.4s, $b1.4s, #7 + ushr $b1.4s, $b0.4s, #25 + sli $b1.4s, $b0.4s, #7 + ushr $b0.4s, $T0.4s, #25 + sli $b0.4s, $T0.4s, #7 + + ext $B4.16b, $B4.16b, $B4.16b, $shift_b + ext $C4.16b, $C4.16b, $C4.16b, #8 + ext $D4.16b, $D4.16b, $D4.16b, $shift_d +___ +} + +{ +$code.=<<___; +.section .rodata + +.align 7 +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Linc: +.long 1,2,3,4 +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.type .Lpoly_hash_ad_internal,%function +.align 6 +.Lpoly_hash_ad_internal: + .cfi_startproc + cbnz $adl, .Lpoly_hash_intro + ret + +.Lpoly_hash_intro: + cmp $adl, #16 + b.lt .Lpoly_hash_ad_tail +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + sub $adl, $adl, #16 + b .Lpoly_hash_ad_internal + +.Lpoly_hash_ad_tail: + cbz $adl, .Lpoly_hash_ad_ret + + eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the AAD + sub $adl, $adl, #1 + +.Lpoly_hash_tail_16_compose: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$adp, $adl] + mov $T0.b[0], $t0w + subs $adl, $adl, #1 + b.ge .Lpoly_hash_tail_16_compose +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + +.Lpoly_hash_ad_ret: + ret + .cfi_endproc +.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal +.type chacha20_poly1305_seal,%function +.align 6 
+chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp $t0, :pg_hi21:.Lchacha20_consts + add $t0, $t0, :lo12:.Lchacha20_consts + + ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp] + + mov $one, #1 // Prepare the Poly1305 state + mov $acc0, #0 + mov $acc1, #0 + mov $acc2, #0 + + ldr $t1, [$keyp, #56] // The total cipher text length includes extra_in_len + add $t1, $t1, $inl + mov $LEN_STORE.d[0], $adl // Store the input and aad lengths + mov $LEN_STORE.d[1], $t1 + + cmp $inl, #128 + b.le .Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. + ld4r {$A0.4s-$A3.4s}, [$t0] + mov $A4.16b, $CONSTS.16b + + ld4r {$B0.4s-$B3.4s}, [$keyp], #16 + mov $B4.16b, $B_STORE.16b + + ld4r {$C0.4s-$C3.4s}, [$keyp], #16 + mov $C4.16b, $C_STORE.16b + + ld4r {$D0.4s-$D3.4s}, [$keyp] + add $D0.4s, $D0.4s, $INC.4s + mov $D4.16b, $D_STORE.16b + + sub $keyp, $keyp, #32 + + mov $itr1, #10 + +.align 5 +.Lseal_init_rounds: +___ + &chacha_qr_x5("left"); + &chacha_qr_x5("right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.hi .Lseal_init_rounds + + add $D0.4s, $D0.4s, $INC.4s + mov $t0, #4 + dup $T0.4s, $t0w + add $INC.4s, $INC.4s, $T0.4s + + zip1 $T0.4s, $A0.4s, $A1.4s + zip2 $T1.4s, $A0.4s, $A1.4s + zip1 $T2.4s, $A2.4s, $A3.4s + zip2 $T3.4s, $A2.4s, $A3.4s + + zip1 $A0.2d, $T0.2d, $T2.2d + zip2 $A1.2d, $T0.2d, $T2.2d + zip1 $A2.2d, $T1.2d, $T3.2d + zip2 $A3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $B0.4s, $B1.4s + zip2 $T1.4s, $B0.4s, $B1.4s + zip1 $T2.4s, $B2.4s, $B3.4s + zip2 $T3.4s, $B2.4s, $B3.4s + + zip1 $B0.2d, $T0.2d, $T2.2d + zip2 $B1.2d, $T0.2d, $T2.2d + zip1 $B2.2d, $T1.2d, $T3.2d + zip2 $B3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $C0.4s, $C1.4s + zip2 $T1.4s, $C0.4s, $C1.4s + zip1 $T2.4s, $C2.4s, $C3.4s + zip2 $T3.4s, $C2.4s, $C3.4s + + zip1 $C0.2d, $T0.2d, $T2.2d + zip2 $C1.2d, $T0.2d, $T2.2d + zip1 $C2.2d, $T1.2d, $T3.2d + zip2 $C3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $D0.4s, $D1.4s + zip2 $T1.4s, $D0.4s, $D1.4s + zip1 $T2.4s, $D2.4s, $D3.4s + zip2 $T3.4s, $D2.4s, $D3.4s + + zip1 $D0.2d, $T0.2d, $T2.2d + zip2 $D1.2d, $T0.2d, $T2.2d + zip1 $D2.2d, $T1.2d, $T3.2d + zip2 $D3.2d, $T1.2d, $T3.2d + + add $A4.4s, $A4.4s, $CONSTS.4s + add $B4.4s, $B4.4s, $B_STORE.4s + and $A4.16b, $A4.16b, $CLAMP.16b + + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + + add $A1.4s, $A1.4s, $CONSTS.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + add $A2.4s, $A2.4s, $CONSTS.4s + add $B2.4s, $B2.4s, $B_STORE.4s + add $C2.4s, $C2.4s, $C_STORE.4s + add $D2.4s, $D2.4s, $D_STORE.4s + + add $A3.4s, $A3.4s, $CONSTS.4s 
+ add $B3.4s, $B3.4s, $B_STORE.4s + add $C3.4s, $C3.4s, $C_STORE.4s + add $D3.4s, $D3.4s, $D_STORE.4s + + mov $r0, $A4.d[0] // Move the R key to GPRs + mov $r1, $A4.d[1] + mov $S_STORE.16b, $B4.16b // Store the S key + + bl .Lpoly_hash_ad_internal + + mov $adp, $oup + cmp $inl, #256 + b.le .Lseal_tail + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A2.16b + eor $T1.16b, $T1.16b, $B2.16b + eor $T2.16b, $T2.16b, $C2.16b + eor $T3.16b, $T3.16b, $D2.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A3.16b + eor $T1.16b, $T1.16b, $B3.16b + eor $T2.16b, $T2.16b, $C3.16b + eor $T3.16b, $T3.16b, $D3.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #256 + + mov $itr1, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov $itr2, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +.Lseal_main_loop: + adrp $t0, :pg_hi21:.Lchacha20_consts + add $t0, $t0, :lo12:.Lchacha20_consts + + ld4r {$A0.4s-$A3.4s}, [$t0] + mov $A4.16b, $CONSTS.16b + + ld4r {$B0.4s-$B3.4s}, [$keyp], #16 + mov $B4.16b, $B_STORE.16b + + ld4r {$C0.4s-$C3.4s}, [$keyp], #16 + mov $C4.16b, $C_STORE.16b + + ld4r {$D0.4s-$D3.4s}, [$keyp] + add $D0.4s, $D0.4s, $INC.4s + mov $D4.16b, $D_STORE.16b + + eor $T0.16b, $T0.16b, $T0.16b //zero + not $T1.16b, $T0.16b // -1 + sub $T1.4s, $INC.4s, $T1.4s // Add +1 + ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) + add $D4.4s, $D4.4s, $T0.4s + + sub $keyp, $keyp, #32 +.align 5 +.Lseal_main_loop_rounds: +___ + &chacha_qr_x5("left"); + &poly_add($adp); + &poly_mul(); + &chacha_qr_x5("right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.ge .Lseal_main_loop_rounds +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + subs $itr2, $itr2, #1 + b.gt .Lseal_main_loop_rounds + + eor $T0.16b, $T0.16b, $T0.16b //zero + not $T1.16b, $T0.16b // -1 + sub $T1.4s, $INC.4s, $T1.4s // Add +1 + ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) + add $D4.4s, $D4.4s, $T0.4s + + add $D0.4s, $D0.4s, $INC.4s + mov $t0, #5 + dup $T0.4s, $t0w + add $INC.4s, $INC.4s, $T0.4s + + zip1 $T0.4s, $A0.4s, $A1.4s + zip2 $T1.4s, $A0.4s, $A1.4s + zip1 $T2.4s, $A2.4s, $A3.4s + zip2 $T3.4s, $A2.4s, $A3.4s + + zip1 $A0.2d, $T0.2d, $T2.2d + zip2 $A1.2d, $T0.2d, $T2.2d + zip1 $A2.2d, $T1.2d, $T3.2d + zip2 $A3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $B0.4s, $B1.4s + zip2 $T1.4s, $B0.4s, $B1.4s + zip1 $T2.4s, $B2.4s, $B3.4s + zip2 $T3.4s, $B2.4s, $B3.4s + + zip1 $B0.2d, $T0.2d, $T2.2d + zip2 $B1.2d, $T0.2d, $T2.2d + zip1 $B2.2d, $T1.2d, $T3.2d + zip2 $B3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $C0.4s, $C1.4s + zip2 $T1.4s, $C0.4s, $C1.4s + zip1 $T2.4s, $C2.4s, $C3.4s + zip2 $T3.4s, $C2.4s, $C3.4s + + zip1 $C0.2d, $T0.2d, $T2.2d + zip2 $C1.2d, $T0.2d, $T2.2d + zip1 $C2.2d, $T1.2d, $T3.2d + zip2 $C3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $D0.4s, $D1.4s + zip2 $T1.4s, $D0.4s, $D1.4s + zip1 $T2.4s, $D2.4s, $D3.4s + zip2 $T3.4s, $D2.4s, $D3.4s + + zip1 $D0.2d, $T0.2d, $T2.2d + zip2 $D1.2d, $T0.2d, $T2.2d + zip1 $D2.2d, 
$T1.2d, $T3.2d + zip2 $D3.2d, $T1.2d, $T3.2d + + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + + add $A1.4s, $A1.4s, $CONSTS.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + add $A2.4s, $A2.4s, $CONSTS.4s + add $B2.4s, $B2.4s, $B_STORE.4s + add $C2.4s, $C2.4s, $C_STORE.4s + add $D2.4s, $D2.4s, $D_STORE.4s + + add $A3.4s, $A3.4s, $CONSTS.4s + add $B3.4s, $B3.4s, $B_STORE.4s + add $C3.4s, $C3.4s, $C_STORE.4s + add $D3.4s, $D3.4s, $D_STORE.4s + + add $A4.4s, $A4.4s, $CONSTS.4s + add $B4.4s, $B4.4s, $B_STORE.4s + add $C4.4s, $C4.4s, $C_STORE.4s + add $D4.4s, $D4.4s, $D_STORE.4s + + cmp $inl, #320 + b.le .Lseal_tail + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A2.16b + eor $T1.16b, $T1.16b, $B2.16b + eor $T2.16b, $T2.16b, $C2.16b + eor $T3.16b, $T3.16b, $D2.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A3.16b + eor $T1.16b, $T1.16b, $B3.16b + eor $T2.16b, $T2.16b, $C3.16b + eor $T3.16b, $T3.16b, $D3.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A4.16b + eor $T1.16b, $T1.16b, $B4.16b + eor $T2.16b, $T2.16b, $C4.16b + eor $T3.16b, $T3.16b, $D4.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #320 + + mov $itr1, #0 + mov $itr2, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b .Lseal_main_loop + +.Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
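+ // Three stages follow: whole 64-byte chunks are encrypted and hashed while
+ // the A/B/C/D register file is shifted down one column per iteration, then
+ // whole 16-byte blocks, and finally any remaining [1,16) bytes are composed
+ // into a padded block (padded with extra_in bytes when available, zeros
+ // otherwise) and hashed as a single Poly1305 block.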
+ cmp $inl, #64 + b.lt .Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b +___ + &poly_add_vec($T0); + &poly_mul(); + &poly_add_vec($T1); + &poly_mul(); + &poly_add_vec($T2); + &poly_mul(); + &poly_add_vec($T3); + &poly_mul(); +$code.=<<___; + st1 {$T0.16b - $T3.16b}, [$oup], #64 + sub $inl, $inl, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov $A0.16b, $A1.16b + mov $B0.16b, $B1.16b + mov $C0.16b, $C1.16b + mov $D0.16b, $D1.16b + + mov $A1.16b, $A2.16b + mov $B1.16b, $B2.16b + mov $C1.16b, $C2.16b + mov $D1.16b, $D2.16b + + mov $A2.16b, $A3.16b + mov $B2.16b, $B3.16b + mov $C2.16b, $C3.16b + mov $D2.16b, $D3.16b + + mov $A3.16b, $A4.16b + mov $B3.16b, $B4.16b + mov $C3.16b, $C4.16b + mov $D3.16b, $D4.16b + + b .Lseal_tail + +.Lseal_tail_64: + ldp $adp, $adl, [$keyp, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp $inl, #16 + b.lt .Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {$T0.16b}, [$inp], #16 + eor $T0.16b, $T0.16b, $A0.16b +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + st1 {$T0.16b}, [$oup], #16 + + sub $inl, $inl, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov $A0.16b, $B0.16b + mov $B0.16b, $C0.16b + mov $C0.16b, $D0.16b + + b .Lseal_tail_64 + +.Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz $inl, .Lseal_hash_extra + + eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the plaintext/extra in + eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not $T2.16b, $T0.16b + + mov $itr1, $inl + add $inp, $inp, $inl + + cbz $adl, .Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov $itr2, #16 // We need to load some extra_in first for padding + sub $itr2, $itr2, $inl + cmp $adl, $itr2 + csel $itr2, $adl, $itr2, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov $t1, $itr2 + add $adp, $adp, $itr2 + sub $adl, $adl, $itr2 + +.Lseal_tail16_compose_extra_in: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$adp, #-1]! + mov $T0.b[0], $t0w + subs $itr2, $itr2, #1 + b.gt .Lseal_tail16_compose_extra_in + + add $adp, $adp, $t1 + +.Lseal_tail_16_compose: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$inp, #-1]! + mov $T0.b[0], $t0w + ext $T1.16b, $T2.16b, $T1.16b, #15 + subs $inl, $inl, #1 + b.gt .Lseal_tail_16_compose + + and $A0.16b, $A0.16b, $T1.16b + eor $T0.16b, $T0.16b, $A0.16b + mov $T1.16b, $T0.16b + +.Lseal_tail_16_store: + umov $t0w, $T0.b[0] + strb $t0w, [$oup], #1 + ext $T0.16b, $T0.16b, $T0.16b, #1 + subs $itr1, $itr1, #1 + b.gt .Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in +___ + &poly_add_vec($T1); + &poly_mul(); +$code.=<<___; + +.Lseal_hash_extra: + cbz $adl, .Lseal_finalize + +.Lseal_hash_extra_loop: + cmp $adl, #16 + b.lt .Lseal_hash_extra_tail + ld1 {$T0.16b}, [$adp], #16 +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + sub $adl, $adl, #16 + b .Lseal_hash_extra_loop + +.Lseal_hash_extra_tail: + cbz $adl, .Lseal_finalize + eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the remaining extra ciphertext + add $adp, $adp, $adl + +.Lseal_hash_extra_load: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$adp, #-1]! 
+ mov $T0.b[0], $t0w + subs $adl, $adl, #1 + b.gt .Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + +.Lseal_finalize: +___ + &poly_add_vec($LEN_STORE); + &poly_mul(); +$code.=<<___; + // Final reduction step + sub $t1, xzr, $one + orr $t2, xzr, #3 + subs $t0, $acc0, #-5 + sbcs $t1, $acc1, $t1 + sbcs $t2, $acc2, $t2 + csel $acc0, $t0, $acc0, cs + csel $acc1, $t1, $acc1, cs + csel $acc2, $t2, $acc2, cs +___ + &poly_add_vec($S_STORE); +$code.=<<___; + + stp $acc0, $acc1, [$keyp] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor $INC.16b, $INC.16b, $INC.16b + mov $t0, #1 + mov $INC.s[0], $t0w + mov $A0.16b, $CONSTS.16b + mov $A1.16b, $CONSTS.16b + mov $A2.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $B1.16b, $B_STORE.16b + mov $B2.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $C1.16b, $C_STORE.16b + mov $C2.16b, $C_STORE.16b + mov $D2.16b, $D_STORE.16b + add $D0.4s, $D2.4s, $INC.4s + add $D1.4s, $D0.4s, $INC.4s + + mov $itr1, #10 + +.Lseal_128_rounds: +___ + &chacha_qr_x3("left"); + &chacha_qr_x3("right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.hi .Lseal_128_rounds + + add $A0.4s, $A0.4s, $CONSTS.4s + add $A1.4s, $A1.4s, $CONSTS.4s + add $A2.4s, $A2.4s, $CONSTS.4s + + add $B0.4s, $B0.4s, $B_STORE.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $B2.4s, $B2.4s, $B_STORE.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating $C2 and $D2. + add $C0.4s, $C0.4s, $C_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + + add $D_STORE.4s, $D_STORE.4s, $INC.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D_STORE.4s, $D_STORE.4s, $INC.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + and $A2.16b, $A2.16b, $CLAMP.16b + mov $r0, $A2.d[0] // Move the R key to GPRs + mov $r1, $A2.d[1] + mov $S_STORE.16b, $B2.16b // Store the S key + + bl .Lpoly_hash_ad_internal + b .Lseal_tail +.cfi_endproc +.size chacha20_poly1305_seal,.-chacha20_poly1305_seal + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open +.type chacha20_poly1305_open,%function +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
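+ // d8-d15 are the low halves of v8-v15, which are callee-saved under AAPCS64
+ // and clobbered by the vector code below, so spill them here and restore
+ // them before returning.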
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp $t0, :pg_hi21:.Lchacha20_consts + add $t0, $t0, :lo12:.Lchacha20_consts + + ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp] + + mov $one, #1 // Prepare the Poly1305 state + mov $acc0, #0 + mov $acc1, #0 + mov $acc2, #0 + + mov $LEN_STORE.d[0], $adl // Store the input and aad lengths + mov $LEN_STORE.d[1], $inl + + cmp $inl, #128 + b.le .Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov $A0.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $D0.16b, $D_STORE.16b + + mov $itr1, #10 + +.align 5 +.Lopen_init_rounds: +___ + &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); + &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.hi .Lopen_init_rounds + + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + + and $A0.16b, $A0.16b, $CLAMP.16b + mov $r0, $A0.d[0] // Move the R key to GPRs + mov $r1, $A0.d[1] + mov $S_STORE.16b, $B0.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_ad_done: + mov $adp, $inp + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +.Lopen_main_loop: + + cmp $inl, #192 + b.lt .Lopen_tail + + adrp $t0, :pg_hi21:.Lchacha20_consts + add $t0, $t0, :lo12:.Lchacha20_consts + + ld4r {$A0.4s-$A3.4s}, [$t0] + mov $A4.16b, $CONSTS.16b + + ld4r {$B0.4s-$B3.4s}, [$keyp], #16 + mov $B4.16b, $B_STORE.16b + + ld4r {$C0.4s-$C3.4s}, [$keyp], #16 + mov $C4.16b, $C_STORE.16b + + ld4r {$D0.4s-$D3.4s}, [$keyp] + sub $keyp, $keyp, #32 + add $D0.4s, $D0.4s, $INC.4s + mov $D4.16b, $D_STORE.16b + + eor $T0.16b, $T0.16b, $T0.16b //zero + not $T1.16b, $T0.16b // -1 + sub $T1.4s, $INC.4s, $T1.4s // Add +1 + ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) + add $D4.4s, $D4.4s, $T0.4s + + lsr $adl, $inl, #4 // How many whole blocks we have to hash, will always be at least 12 + sub $adl, $adl, #10 + + mov $itr2, #10 + subs $itr1, $itr2, $adl + subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 320 bytes to hash + csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz $itr2, .Lopen_main_loop_rounds_short + +.align 5 +.Lopen_main_loop_rounds: +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; +.Lopen_main_loop_rounds_short: +___ + &chacha_qr_x5("left"); + &poly_add($adp); + &poly_mul(); + &chacha_qr_x5("right"); +$code.=<<___; + subs $itr2, $itr2, #1 + b.gt .Lopen_main_loop_rounds + subs $itr1, $itr1, #1 + b.ge .Lopen_main_loop_rounds_short +___ +$code.=<<___; + + eor $T0.16b, $T0.16b, $T0.16b //zero + not $T1.16b, $T0.16b // -1 + sub $T1.4s, $INC.4s, $T1.4s // Add +1 + ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter) + add $D4.4s, $D4.4s, $T0.4s + + add $D0.4s, $D0.4s, $INC.4s + mov $t0, #5 + dup $T0.4s, $t0w + add $INC.4s, $INC.4s, $T0.4s + + zip1 $T0.4s, $A0.4s, $A1.4s + zip2 $T1.4s, $A0.4s, $A1.4s + zip1 $T2.4s, $A2.4s, $A3.4s + zip2 $T3.4s, $A2.4s, $A3.4s + + zip1 $A0.2d, $T0.2d, $T2.2d + zip2 $A1.2d, $T0.2d, $T2.2d + zip1 $A2.2d, $T1.2d, $T3.2d + zip2 $A3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $B0.4s, $B1.4s + zip2 $T1.4s, 
$B0.4s, $B1.4s + zip1 $T2.4s, $B2.4s, $B3.4s + zip2 $T3.4s, $B2.4s, $B3.4s + + zip1 $B0.2d, $T0.2d, $T2.2d + zip2 $B1.2d, $T0.2d, $T2.2d + zip1 $B2.2d, $T1.2d, $T3.2d + zip2 $B3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $C0.4s, $C1.4s + zip2 $T1.4s, $C0.4s, $C1.4s + zip1 $T2.4s, $C2.4s, $C3.4s + zip2 $T3.4s, $C2.4s, $C3.4s + + zip1 $C0.2d, $T0.2d, $T2.2d + zip2 $C1.2d, $T0.2d, $T2.2d + zip1 $C2.2d, $T1.2d, $T3.2d + zip2 $C3.2d, $T1.2d, $T3.2d + + zip1 $T0.4s, $D0.4s, $D1.4s + zip2 $T1.4s, $D0.4s, $D1.4s + zip1 $T2.4s, $D2.4s, $D3.4s + zip2 $T3.4s, $D2.4s, $D3.4s + + zip1 $D0.2d, $T0.2d, $T2.2d + zip2 $D1.2d, $T0.2d, $T2.2d + zip1 $D2.2d, $T1.2d, $T3.2d + zip2 $D3.2d, $T1.2d, $T3.2d + + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + + add $A1.4s, $A1.4s, $CONSTS.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + add $A2.4s, $A2.4s, $CONSTS.4s + add $B2.4s, $B2.4s, $B_STORE.4s + add $C2.4s, $C2.4s, $C_STORE.4s + add $D2.4s, $D2.4s, $D_STORE.4s + + add $A3.4s, $A3.4s, $CONSTS.4s + add $B3.4s, $B3.4s, $B_STORE.4s + add $C3.4s, $C3.4s, $C_STORE.4s + add $D3.4s, $D3.4s, $D_STORE.4s + + add $A4.4s, $A4.4s, $CONSTS.4s + add $B4.4s, $B4.4s, $B_STORE.4s + add $C4.4s, $C4.4s, $C_STORE.4s + add $D4.4s, $D4.4s, $D_STORE.4s + + // We can always safely store 192 bytes + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A2.16b + eor $T1.16b, $T1.16b, $B2.16b + eor $T2.16b, $T2.16b, $C2.16b + eor $T3.16b, $T3.16b, $D2.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #192 + + mov $A0.16b, $A3.16b + mov $B0.16b, $B3.16b + mov $C0.16b, $C3.16b + mov $D0.16b, $D3.16b + + cmp $inl, #64 + b.lt .Lopen_tail_64_store + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A3.16b + eor $T1.16b, $T1.16b, $B3.16b + eor $T2.16b, $T2.16b, $C3.16b + eor $T3.16b, $T3.16b, $D3.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #64 + + mov $A0.16b, $A4.16b + mov $B0.16b, $B4.16b + mov $C0.16b, $C4.16b + mov $D0.16b, $D4.16b + + cmp $inl, #64 + b.lt .Lopen_tail_64_store + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + eor $T0.16b, $T0.16b, $A4.16b + eor $T1.16b, $T1.16b, $B4.16b + eor $T2.16b, $T2.16b, $C4.16b + eor $T3.16b, $T3.16b, $D4.16b + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #64 + b .Lopen_main_loop + +.Lopen_tail: + + cbz $inl, .Lopen_finalize + + lsr $adl, $inl, #4 // How many whole blocks we have to hash + + cmp $inl, #64 + b.le .Lopen_tail_64 + cmp $inl, #128 + b.le .Lopen_tail_128 + +.Lopen_tail_192: + // We need three more blocks + mov $A0.16b, $CONSTS.16b + mov $A1.16b, $CONSTS.16b + mov $A2.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $B1.16b, $B_STORE.16b + mov $B2.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $C1.16b, $C_STORE.16b + mov $C2.16b, $C_STORE.16b + mov $D0.16b, $D_STORE.16b + mov $D1.16b, $D_STORE.16b + mov $D2.16b, $D_STORE.16b + eor $T3.16b, $T3.16b, $T3.16b + eor $T1.16b, $T1.16b, $T1.16b + ins $T3.s[0], $INC.s[0] + ins $T1.d[0], $one + + add $T2.4s, $T3.4s, $T1.4s + add 
$T1.4s, $T2.4s, $T1.4s + + add $D0.4s, $D0.4s, $T1.4s + add $D1.4s, $D1.4s, $T3.4s + add $D2.4s, $D2.4s, $T2.4s + + mov $itr2, #10 + subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 160 bytes to hash + csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub $adl, $adl, $itr2 + + cbz $itr2, .Lopen_tail_192_rounds_no_hash + +.Lopen_tail_192_rounds: +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; +.Lopen_tail_192_rounds_no_hash: +___ + &chacha_qr_x3("left"); + &chacha_qr_x3("right"); +$code.=<<___; + subs $itr2, $itr2, #1 + b.gt .Lopen_tail_192_rounds + subs $itr1, $itr1, #1 + b.ge .Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +.Lopen_tail_192_hash: + cbz $adl, .Lopen_tail_192_hash_done +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + sub $adl, $adl, #1 + b .Lopen_tail_192_hash + +.Lopen_tail_192_hash_done: + + add $A0.4s, $A0.4s, $CONSTS.4s + add $A1.4s, $A1.4s, $CONSTS.4s + add $A2.4s, $A2.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $B2.4s, $B2.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $C2.4s, $C2.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + add $D2.4s, $D2.4s, $D_STORE.4s + + add $D0.4s, $D0.4s, $T1.4s + add $D1.4s, $D1.4s, $T3.4s + add $D2.4s, $D2.4s, $T2.4s + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + + eor $T0.16b, $T0.16b, $A2.16b + eor $T1.16b, $T1.16b, $B2.16b + eor $T2.16b, $T2.16b, $C2.16b + eor $T3.16b, $T3.16b, $D2.16b + + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #128 + b .Lopen_tail_64_store + +.Lopen_tail_128: + // We need two more blocks + mov $A0.16b, $CONSTS.16b + mov $A1.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $B1.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $C1.16b, $C_STORE.16b + mov $D0.16b, $D_STORE.16b + mov $D1.16b, $D_STORE.16b + eor $T3.16b, $T3.16b, $T3.16b + eor $T2.16b, $T2.16b, $T2.16b + ins $T3.s[0], $INC.s[0] + ins $T2.d[0], $one + add $T2.4s, $T2.4s, $T3.4s + + add $D0.4s, $D0.4s, $T2.4s + add $D1.4s, $D1.4s, $T3.4s + + mov $itr1, #10 + sub $itr1, $itr1, $adl + +.Lopen_tail_128_rounds: +___ + &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); + &chacha_qr($A1, $B1, $C1, $D1, $T0, "left"); + &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); + &chacha_qr($A1, $B1, $C1, $D1, $T0, "right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.gt .Lopen_tail_128_rounds + cbz $adl, .Lopen_tail_128_rounds_done + subs $adl, $adl, #1 +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + b .Lopen_tail_128_rounds + +.Lopen_tail_128_rounds_done: + add $A0.4s, $A0.4s, $CONSTS.4s + add $A1.4s, $A1.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $C1.4s, $C1.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D1.4s, $D1.4s, $D_STORE.4s + add $D0.4s, $D0.4s, $T2.4s + add $D1.4s, $D1.4s, $T3.4s + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + + eor $T0.16b, $T0.16b, $A1.16b + eor $T1.16b, $T1.16b, $B1.16b + eor $T2.16b, $T2.16b, $C1.16b + eor $T3.16b, $T3.16b, $D1.16b + + st1 {$T0.16b - $T3.16b}, [$oup], #64 + sub $inl, $inl, #64 + + b .Lopen_tail_64_store + +.Lopen_tail_64: + // We just need a single block + 
mov $A0.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $D0.16b, $D_STORE.16b + eor $T3.16b, $T3.16b, $T3.16b + ins $T3.s[0], $INC.s[0] + add $D0.4s, $D0.4s, $T3.4s + + mov $itr1, #10 + sub $itr1, $itr1, $adl + +.Lopen_tail_64_rounds: +___ + &chacha_qr($A0, $B0, $C0, $D0, $T0, "left"); + &chacha_qr($A0, $B0, $C0, $D0, $T0, "right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.gt .Lopen_tail_64_rounds + cbz $adl, .Lopen_tail_64_rounds_done + subs $adl, $adl, #1 +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + b .Lopen_tail_64_rounds + +.Lopen_tail_64_rounds_done: + add $A0.4s, $A0.4s, $CONSTS.4s + add $B0.4s, $B0.4s, $B_STORE.4s + add $C0.4s, $C0.4s, $C_STORE.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D0.4s, $D0.4s, $T3.4s + +.Lopen_tail_64_store: + cmp $inl, #16 + b.lt .Lopen_tail_16 + + ld1 {$T0.16b}, [$inp], #16 + eor $T0.16b, $T0.16b, $A0.16b + st1 {$T0.16b}, [$oup], #16 + mov $A0.16b, $B0.16b + mov $B0.16b, $C0.16b + mov $C0.16b, $D0.16b + sub $inl, $inl, #16 + b .Lopen_tail_64_store + +.Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz $inl, .Lopen_finalize + + eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the ciphertext + eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask + not $T2.16b, $T0.16b + + add $itr2, $inp, $inl + mov $itr1, $inl + +.Lopen_tail_16_compose: + ext $T0.16b, $T0.16b, $T0.16b, #15 + ldrb $t0w, [$itr2, #-1]! + mov $T0.b[0], $t0w + ext $T1.16b, $T2.16b, $T1.16b, #15 + subs $inl, $inl, #1 + b.gt .Lopen_tail_16_compose + + and $T0.16b, $T0.16b, $T1.16b + // Hash in the final padded block +___ + &poly_add_vec($T0); + &poly_mul(); +$code.=<<___; + eor $T0.16b, $T0.16b, $A0.16b + +.Lopen_tail_16_store: + umov $t0w, $T0.b[0] + strb $t0w, [$oup], #1 + ext $T0.16b, $T0.16b, $T0.16b, #1 + subs $itr1, $itr1, #1 + b.gt .Lopen_tail_16_store + +.Lopen_finalize: +___ + &poly_add_vec($LEN_STORE); + &poly_mul(); +$code.=<<___; + // Final reduction step + sub $t1, xzr, $one + orr $t2, xzr, #3 + subs $t0, $acc0, #-5 + sbcs $t1, $acc1, $t1 + sbcs $t2, $acc2, $t2 + csel $acc0, $t0, $acc0, cs + csel $acc1, $t1, $acc1, cs + csel $acc2, $t2, $acc2, cs +___ + &poly_add_vec($S_STORE); +$code.=<<___; + + stp $acc0, $acc1, [$keyp] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor $INC.16b, $INC.16b, $INC.16b + mov $t0, #1 + mov $INC.s[0], $t0w + mov $A0.16b, $CONSTS.16b + mov $A1.16b, $CONSTS.16b + mov $A2.16b, $CONSTS.16b + mov $B0.16b, $B_STORE.16b + mov $B1.16b, $B_STORE.16b + mov $B2.16b, $B_STORE.16b + mov $C0.16b, $C_STORE.16b + mov $C1.16b, $C_STORE.16b + mov $C2.16b, $C_STORE.16b + mov $D2.16b, $D_STORE.16b + add $D0.4s, $D2.4s, $INC.4s + add $D1.4s, $D0.4s, $INC.4s + + mov $itr1, #10 + +.Lopen_128_rounds: +___ + &chacha_qr_x3("left"); + &chacha_qr_x3("right"); +$code.=<<___; + subs $itr1, $itr1, #1 + b.hi .Lopen_128_rounds + + add $A0.4s, $A0.4s, $CONSTS.4s + add $A1.4s, $A1.4s, $CONSTS.4s + add $A2.4s, $A2.4s, $CONSTS.4s + + add $B0.4s, $B0.4s, $B_STORE.4s + add $B1.4s, $B1.4s, $B_STORE.4s + add $B2.4s, $B2.4s, $B_STORE.4s + + add $C0.4s, $C0.4s, $C_STORE.4s + add $C1.4s, $C1.4s, 
$C_STORE.4s + + add $D_STORE.4s, $D_STORE.4s, $INC.4s + add $D0.4s, $D0.4s, $D_STORE.4s + add $D_STORE.4s, $D_STORE.4s, $INC.4s + add $D1.4s, $D1.4s, $D_STORE.4s + + and $A2.16b, $A2.16b, $CLAMP.16b + mov $r0, $A2.d[0] // Move the R key to GPRs + mov $r1, $A2.d[1] + mov $S_STORE.16b, $B2.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_128_store: + cmp $inl, #64 + b.lt .Lopen_128_store_64 + + ld1 {$T0.16b - $T3.16b}, [$inp], #64 + +___ + &poly_add_vec($T0); + &poly_mul(); + &poly_add_vec($T1); + &poly_mul(); + &poly_add_vec($T2); + &poly_mul(); + &poly_add_vec($T3); + &poly_mul(); +$code.=<<___; + + eor $T0.16b, $T0.16b, $A0.16b + eor $T1.16b, $T1.16b, $B0.16b + eor $T2.16b, $T2.16b, $C0.16b + eor $T3.16b, $T3.16b, $D0.16b + + st1 {$T0.16b - $T3.16b}, [$oup], #64 + + sub $inl, $inl, #64 + + mov $A0.16b, $A1.16b + mov $B0.16b, $B1.16b + mov $C0.16b, $C1.16b + mov $D0.16b, $D1.16b + +.Lopen_128_store_64: + + lsr $adl, $inl, #4 + mov $adp, $inp + +.Lopen_128_hash_64: + cbz $adl, .Lopen_tail_64_store +___ + &poly_add($adp); + &poly_mul(); +$code.=<<___; + sub $adl, $adl, #1 + b .Lopen_128_hash_64 +.cfi_endproc +.size chacha20_poly1305_open,.-chacha20_poly1305_open +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT"; diff --git a/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_x86_64.pl b/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_x86_64.pl new file mode 100644 index 0000000000..1abc178090 --- /dev/null +++ b/ring-0.17.14/crypto/cipher/asm/chacha20_poly1305_x86_64.pl @@ -0,0 +1,2609 @@ +#!/usr/bin/env perl + +# Copyright (c) 2015, CloudFlare Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +############################################################################## +# # +# Author: Vlad Krasnov # +# # +############################################################################## + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$avx = 2; + +$code.=<<___; +.section .rodata +.align 64 +chacha20_poly1305_constants: +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lrol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.Lavx2_init: +.long 0,0,0,0 +.Lsse_inc: +.long 1,0,0,0 +.Lavx2_inc: +.long 2,0,0,0,2,0,0,0 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.align 16 +.Land_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +.text +___ + +my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2,$adl)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8","%r8"); +my ($acc0,$acc1,$acc2)=map("%r$_",(10..12)); +my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9"); +my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15)); +my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); +my $xmm_storage = 0; +if ($win64) { + $xmm_storage = 10*16; +} +my $xmm_store="0*16(%rbp)"; +my $r_store="$xmm_storage+0*16(%rbp)"; +my $s_store="$xmm_storage+1*16(%rbp)"; +my $len_store="$xmm_storage+2*16(%rbp)"; +my $state1_store="$xmm_storage+3*16(%rbp)"; +my $state2_store="$xmm_storage+4*16(%rbp)"; +my $tmp_store="$xmm_storage+5*16(%rbp)"; +my $ctr0_store="$xmm_storage+6*16(%rbp)"; +my $ctr1_store="$xmm_storage+7*16(%rbp)"; +my $ctr2_store="$xmm_storage+8*16(%rbp)"; +my 
$ctr3_store="$xmm_storage+9*16(%rbp)"; + +sub chacha_qr { +my ($a,$b,$c,$d,$t,$dir)=@_; +$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/); +$code.="paddd $b, $a + pxor $a, $d + pshufb .Lrol16(%rip), $d + paddd $d, $c + pxor $c, $b + movdqa $b, $t + pslld \$12, $t + psrld \$20, $b + pxor $t, $b + paddd $b, $a + pxor $a, $d + pshufb .Lrol8(%rip), $d + paddd $d, $c + pxor $c, $b + movdqa $b, $t + pslld \$7, $t + psrld \$25, $b + pxor $t, $b\n"; +$code.="palignr \$4, $b, $b + palignr \$8, $c, $c + palignr \$12, $d, $d\n" if ($dir =~ /left/); +$code.="palignr \$12, $b, $b + palignr \$8, $c, $c + palignr \$4, $d, $d\n" if ($dir =~ /right/); +$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/); +} + +sub poly_add { +my ($src)=@_; +$code.="add 0+$src, $acc0 + adc 8+$src, $acc1 + adc \$1, $acc2\n"; +} + +sub poly_stage1 { +$code.="mov 0+$r_store, %rax + mov %rax, $t2 + mul $acc0 + mov %rax, $t0 + mov %rdx, $t1 + mov 0+$r_store, %rax + mul $acc1 + imulq $acc2, $t2 + add %rax, $t1 + adc %rdx, $t2\n"; +} + +sub poly_stage2 { +$code.="mov 8+$r_store, %rax + mov %rax, $t3 + mul $acc0 + add %rax, $t1 + adc \$0, %rdx + mov %rdx, $acc0 + mov 8+$r_store, %rax + mul $acc1 + add %rax, $t2 + adc \$0, %rdx\n"; +} + +sub poly_stage3 { +$code.="imulq $acc2, $t3 + add $acc0, $t2 + adc %rdx, $t3\n"; +} + +# At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of +# r = [r1:r0] and acc = [acc2:acc1:acc0] +# r is 124 bits at most (due to clamping) and acc is 131 bits at most +# (acc2 is at most 4 before the addition and can be at most 6 when we add in +# the next block) therefore t is at most 255 bits big, and t3 is 63 bits. +sub poly_reduce_stage { +$code.="mov $t0, $acc0 + mov $t1, $acc1 + mov $t2, $acc2 + and \$3, $acc2 # At this point acc2 is 2 bits at most (value of 3) + mov $t2, $t0 + and \$-4, $t0 + mov $t3, $t1 + shrd \$2, $t3, $t2 + shr \$2, $t3 + add $t0, $t2 + adc $t1, $t3 # No carry out since t3 is 61 bits and t1 is 63 bits + add $t2, $acc0 + adc $t3, $acc1 + adc \$0, $acc2\n"; # At this point acc2 has the value of 4 at most +} + +sub poly_mul { + &poly_stage1(); + &poly_stage2(); + &poly_stage3(); + &poly_reduce_stage(); +} + +sub prep_state { +my ($n)=@_; +$code.="movdqa .Lchacha20_consts(%rip), $A0 + movdqa $state1_store, $B0 + movdqa $state2_store, $C0\n"; +$code.="movdqa $A0, $A1 + movdqa $B0, $B1 + movdqa $C0, $C1\n" if ($n ge 2); +$code.="movdqa $A0, $A2 + movdqa $B0, $B2 + movdqa $C0, $C2\n" if ($n ge 3); +$code.="movdqa $A0, $A3 + movdqa $B0, $B3 + movdqa $C0, $C3\n" if ($n ge 4); +$code.="movdqa $ctr0_store, $D0 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $ctr0_store\n" if ($n eq 1); +$code.="movdqa $ctr0_store, $D1 + paddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D0 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $ctr0_store + movdqa $D1, $ctr1_store\n" if ($n eq 2); +$code.="movdqa $ctr0_store, $D2 + paddd .Lsse_inc(%rip), $D2 + movdqa $D2, $D1 + paddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D0 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $ctr0_store + movdqa $D1, $ctr1_store + movdqa $D2, $ctr2_store\n" if ($n eq 3); +$code.="movdqa $ctr0_store, $D3 + paddd .Lsse_inc(%rip), $D3 + movdqa $D3, $D2 + paddd .Lsse_inc(%rip), $D2 + movdqa $D2, $D1 + paddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D0 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $ctr0_store + movdqa $D1, $ctr1_store + movdqa $D2, $ctr2_store + movdqa $D3, $ctr3_store\n" if ($n eq 4); +} + +sub finalize_state { +my ($n)=@_; +$code.="paddd .Lchacha20_consts(%rip), $A3 + paddd $state1_store, $B3 + paddd $state2_store, $C3 + paddd 
$ctr3_store, $D3\n" if ($n eq 4); +$code.="paddd .Lchacha20_consts(%rip), $A2 + paddd $state1_store, $B2 + paddd $state2_store, $C2 + paddd $ctr2_store, $D2\n" if ($n ge 3); +$code.="paddd .Lchacha20_consts(%rip), $A1 + paddd $state1_store, $B1 + paddd $state2_store, $C1 + paddd $ctr1_store, $D1\n" if ($n ge 2); +$code.="paddd .Lchacha20_consts(%rip), $A0 + paddd $state1_store, $B0 + paddd $state2_store, $C0 + paddd $ctr0_store, $D0\n"; +} + +sub xor_stream { +my ($A, $B, $C, $D, $offset)=@_; +$code.="movdqu 0*16 + $offset($inp), $A3 + movdqu 1*16 + $offset($inp), $B3 + movdqu 2*16 + $offset($inp), $C3 + movdqu 3*16 + $offset($inp), $D3 + pxor $A3, $A + pxor $B3, $B + pxor $C3, $C + pxor $D, $D3 + movdqu $A, 0*16 + $offset($oup) + movdqu $B, 1*16 + $offset($oup) + movdqu $C, 2*16 + $offset($oup) + movdqu $D3, 3*16 + $offset($oup)\n"; +} + +sub xor_stream_using_temp { +my ($A, $B, $C, $D, $offset, $temp)=@_; +$code.="movdqa $temp, $tmp_store + movdqu 0*16 + $offset($inp), $temp + pxor $A, $temp + movdqu $temp, 0*16 + $offset($oup) + movdqu 1*16 + $offset($inp), $temp + pxor $B, $temp + movdqu $temp, 1*16 + $offset($oup) + movdqu 2*16 + $offset($inp), $temp + pxor $C, $temp + movdqu $temp, 2*16 + $offset($oup) + movdqu 3*16 + $offset($inp), $temp + pxor $D, $temp + movdqu $temp, 3*16 + $offset($oup)\n"; +} + +sub gen_chacha_round { +my ($rot1, $rot2, $shift)=@_; +my $round=""; +$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20); +$round.="movdqa $rot2, $C0 + paddd $B3, $A3 + paddd $B2, $A2 + paddd $B1, $A1 + paddd $B0, $A0 + pxor $A3, $D3 + pxor $A2, $D2 + pxor $A1, $D1 + pxor $A0, $D0 + pshufb $C0, $D3 + pshufb $C0, $D2 + pshufb $C0, $D1 + pshufb $C0, $D0 + movdqa $tmp_store, $C0 + paddd $D3, $C3 + paddd $D2, $C2 + paddd $D1, $C1 + paddd $D0, $C0 + pxor $C3, $B3 + pxor $C2, $B2 + pxor $C1, $B1 + pxor $C0, $B0 + movdqa $C0, $tmp_store + movdqa $B3, $C0 + psrld \$$rot1, $C0 + pslld \$32-$rot1, $B3 + pxor $C0, $B3 + movdqa $B2, $C0 + psrld \$$rot1, $C0 + pslld \$32-$rot1, $B2 + pxor $C0, $B2 + movdqa $B1, $C0 + psrld \$$rot1, $C0 + pslld \$32-$rot1, $B1 + pxor $C0, $B1 + movdqa $B0, $C0 + psrld \$$rot1, $C0 + pslld \$32-$rot1, $B0 + pxor $C0, $B0\n"; +($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); +($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); +$round.="movdqa $tmp_store, $C0 + palignr \$$s1, $B3, $B3 + palignr \$$s2, $C3, $C3 + palignr \$$s3, $D3, $D3 + palignr \$$s1, $B2, $B2 + palignr \$$s2, $C2, $C2 + palignr \$$s3, $D2, $D2 + palignr \$$s1, $B1, $B1 + palignr \$$s2, $C1, $C1 + palignr \$$s3, $D1, $D1 + palignr \$$s1, $B0, $B0 + palignr \$$s2, $C0, $C0 + palignr \$$s3, $D0, $D0\n" +if (($shift =~ /left/) || ($shift =~ /right/)); +return $round; +}; + +$chacha_body = &gen_chacha_round(20, ".Lrol16(%rip)") . + &gen_chacha_round(25, ".Lrol8(%rip)", "left") . + &gen_chacha_round(20, ".Lrol16(%rip)") . 
+ &gen_chacha_round(25, ".Lrol8(%rip)", "right"); + +my @loop_body = split /\n/, $chacha_body; + +sub emit_body { +my ($n)=@_; + for (my $i=0; $i < $n; $i++) { + $code=$code.shift(@loop_body)."\n"; + }; +} + +{ +################################################################################ +# void poly_hash_ad_internal(); +$code.=" +.type poly_hash_ad_internal,\@abi-omnipotent +.align 64 +poly_hash_ad_internal: +.cfi_startproc +.cfi_def_cfa rsp, 8 + xor $acc0, $acc0 + xor $acc1, $acc1 + xor $acc2, $acc2 + cmp \$13, $itr2 + jne .Lhash_ad_loop +.Lpoly_fast_tls_ad: + # Special treatment for the TLS case of 13 bytes + mov ($adp), $acc0 + mov 5($adp), $acc1 + shr \$24, $acc1 + mov \$1, $acc2\n"; + &poly_mul(); $code.=" + ret +.Lhash_ad_loop: + # Hash in 16 byte chunk + cmp \$16, $itr2 + jb .Lhash_ad_tail\n"; + &poly_add("0($adp)"); + &poly_mul(); $code.=" + lea 1*16($adp), $adp + sub \$16, $itr2 + jmp .Lhash_ad_loop +.Lhash_ad_tail: + cmp \$0, $itr2 + je .Lhash_ad_done + # Hash last < 16 byte tail + xor $t0, $t0 + xor $t1, $t1 + xor $t2, $t2 + add $itr2, $adp +.Lhash_ad_tail_loop: + shld \$8, $t0, $t1 + shl \$8, $t0 + movzxb -1($adp), $t2 + xor $t2, $t0 + dec $adp + dec $itr2 + jne .Lhash_ad_tail_loop + + add $t0, $acc0 + adc $t1, $acc1 + adc \$1, $acc2\n"; + &poly_mul(); $code.=" + # Finished AD +.Lhash_ad_done: + ret +.cfi_endproc +.size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; +} + +{ +################################################################################ +# void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext, +# size_t plaintext_len, const uint8_t *ad, +# size_t ad_len, +# union chacha20_poly1305_open_data *aead_data) +# +$code.=" +.globl chacha20_poly1305_open_sse41 +.type chacha20_poly1305_open_sse41,\@function,6 +.align 64 +chacha20_poly1305_open_sse41: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + # We write the calculated authenticator back to keyp at the end, so save + # the pointer on the stack too. 
+ push $keyp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset 288 + 32 + + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + + cmp \$128, $inl + jbe .Lopen_sse_128 + # For long buffers, prepare the poly key first + movdqa .Lchacha20_consts(%rip), $A0 + movdqu 0*16($keyp), $B0 + movdqu 1*16($keyp), $C0 + movdqu 2*16($keyp), $D0 + + movdqa $D0, $T1 + # Store on stack, to free keyp + movdqa $B0, $state1_store + movdqa $C0, $state2_store + movdqa $D0, $ctr0_store + mov \$10, $acc0 +.Lopen_sse_init_rounds:\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + dec $acc0 + jne .Lopen_sse_init_rounds + # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + paddd .Lchacha20_consts(%rip), $A0 + paddd $state1_store, $B0 + # Clamp and store the key + pand .Lclamp(%rip), $A0 + movdqa $A0, $r_store + movdqa $B0, $s_store + # Hash + mov $adl, $itr2 + call poly_hash_ad_internal +.Lopen_sse_main_loop: + cmp \$16*16, $inl + jb .Lopen_sse_tail + # Load state, increment counter blocks\n"; + &prep_state(4); $code.=" + # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we + # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + mov \$4, $itr1 + mov $inp, $itr2 +.Lopen_sse_main_loop_rounds:\n"; + &emit_body(20); + &poly_add("0($itr2)"); $code.=" + lea 2*8($itr2), $itr2\n"; + &emit_body(20); + &poly_stage1(); + &emit_body(20); + &poly_stage2(); + &emit_body(20); + &poly_stage3(); + &emit_body(20); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + dec $itr1 + jge .Lopen_sse_main_loop_rounds\n"; + &poly_add("0($itr2)"); + &poly_mul(); $code.=" + lea 2*8($itr2), $itr2 + cmp \$-6, $itr1 + jg .Lopen_sse_main_loop_rounds\n"; + &finalize_state(4); + &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); + &xor_stream($A2, $B2, $C2, $D2, "4*16"); + &xor_stream($A1, $B1, $C1, $D1, "8*16"); + &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.=" + lea 16*16($inp), $inp + lea 16*16($oup), $oup + sub \$16*16, $inl + jmp .Lopen_sse_main_loop +.Lopen_sse_tail: + # Handle the various tail sizes efficiently + test $inl, $inl + jz .Lopen_sse_finalize + cmp \$12*16, $inl + ja .Lopen_sse_tail_256 + cmp \$8*16, $inl + ja .Lopen_sse_tail_192 + cmp \$4*16, $inl + ja .Lopen_sse_tail_128\n"; +############################################################################### + # At most 64 bytes are left + &prep_state(1); $code.=" + xor $itr2, $itr2 + mov $inl, $itr1 + cmp \$16, $itr1 + jb .Lopen_sse_tail_64_rounds +.Lopen_sse_tail_64_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); + &poly_mul(); $code.=" + sub \$16, $itr1 +.Lopen_sse_tail_64_rounds: + add \$16, $itr2\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + cmp \$16, $itr1 + jae .Lopen_sse_tail_64_rounds_and_x1hash + cmp \$10*16, $itr2 + jne .Lopen_sse_tail_64_rounds\n"; + &finalize_state(1); $code.=" + jmp .Lopen_sse_tail_64_dec_loop +############################################################################### 
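+ # The remaining tail paths mirror the 64-byte case above. As a rough
+ # worked example, a 100-byte tail takes the 128-byte path: the six full
+ # 16-byte ciphertext blocks are hashed while the ten ChaCha double-rounds
+ # run, the first 64 bytes are decrypted with one keystream block, and the
+ # last 36 bytes are left to .Lopen_sse_tail_64_dec_loop and the byte-wise
+ # tail handling below.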
+.Lopen_sse_tail_128:\n"; + # 65 - 128 bytes are left + &prep_state(2); $code.=" + mov $inl, $itr1 + and \$-16, $itr1 + xor $itr2, $itr2 +.Lopen_sse_tail_128_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); + &poly_mul(); $code.=" +.Lopen_sse_tail_128_rounds: + add \$16, $itr2\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" + cmp $itr1, $itr2 + jb .Lopen_sse_tail_128_rounds_and_x1hash + cmp \$10*16, $itr2 + jne .Lopen_sse_tail_128_rounds\n"; + &finalize_state(2); + &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" + sub \$4*16, $inl + lea 4*16($inp), $inp + lea 4*16($oup), $oup + jmp .Lopen_sse_tail_64_dec_loop +############################################################################### +.Lopen_sse_tail_192:\n"; + # 129 - 192 bytes are left + &prep_state(3); $code.=" + mov $inl, $itr1 + mov \$10*16, $itr2 + cmp \$10*16, $itr1 + cmovg $itr2, $itr1 + and \$-16, $itr1 + xor $itr2, $itr2 +.Lopen_sse_tail_192_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); + &poly_mul(); $code.=" +.Lopen_sse_tail_192_rounds: + add \$16, $itr2\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + cmp $itr1, $itr2 + jb .Lopen_sse_tail_192_rounds_and_x1hash + cmp \$10*16, $itr2 + jne .Lopen_sse_tail_192_rounds + cmp \$11*16, $inl + jb .Lopen_sse_tail_192_finish\n"; + &poly_add("10*16($inp)"); + &poly_mul(); $code.=" + cmp \$12*16, $inl + jb .Lopen_sse_tail_192_finish\n"; + &poly_add("11*16($inp)"); + &poly_mul(); $code.=" +.Lopen_sse_tail_192_finish: \n"; + &finalize_state(3); + &xor_stream($A2, $B2, $C2, $D2, "0*16"); + &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" + sub \$8*16, $inl + lea 8*16($inp), $inp + lea 8*16($oup), $oup + jmp .Lopen_sse_tail_64_dec_loop +############################################################################### +.Lopen_sse_tail_256:\n"; + # 193 - 255 bytes are left + &prep_state(4); $code.=" + xor $itr2, $itr2 +.Lopen_sse_tail_256_rounds_and_x1hash: \n"; + &poly_add("0($inp,$itr2)"); + &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); + &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); + &poly_stage1(); + &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load"); + &poly_stage2(); + &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right"); + &chacha_qr($A1,$B1,$C1,$D1,$C3,"right"); + &poly_stage3(); + &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load"); + &poly_reduce_stage(); + &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" + add \$16, $itr2 + cmp \$10*16, $itr2 + jb .Lopen_sse_tail_256_rounds_and_x1hash + + mov $inl, $itr1 + and \$-16, $itr1 +.Lopen_sse_tail_256_hash: \n"; + &poly_add("0($inp,$itr2)"); + &poly_mul(); $code.=" + add \$16, $itr2 + cmp $itr1, $itr2 + jb .Lopen_sse_tail_256_hash\n"; + &finalize_state(4); + &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); + &xor_stream($A2, $B2, $C2, $D2, "4*16"); + &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.=" + movdqa $tmp_store, $D0 + sub \$12*16, $inl + lea 12*16($inp), $inp + lea 12*16($oup), $oup +############################################################################### + # Decrypt the remaining data, 16B at a time, using existing stream +.Lopen_sse_tail_64_dec_loop: + cmp \$16, $inl + jb .Lopen_sse_tail_16_init + sub \$16, $inl 
+ movdqu ($inp), $T0 + pxor $T0, $A0 + movdqu $A0, ($oup) + lea 16($inp), $inp + lea 16($oup), $oup + movdqa $B0, $A0 + movdqa $C0, $B0 + movdqa $D0, $C0 + jmp .Lopen_sse_tail_64_dec_loop +.Lopen_sse_tail_16_init: + movdqa $A0, $A1 + + # Decrypt up to 16 bytes at the end. +.Lopen_sse_tail_16: + test $inl, $inl + jz .Lopen_sse_finalize + + # Read the final bytes into $T0. They need to be read in reverse order so + # that they end up in the correct order in $T0. + pxor $T0, $T0 + lea -1($inp,$inl), $inp + movq $inl, $itr2 +.Lopen_sse_tail_16_compose: + pslldq \$1, $T0 + pinsrb \$0, ($inp), $T0 + sub \$1, $inp + sub \$1, $itr2 + jnz .Lopen_sse_tail_16_compose + + movq $T0, $t0 + pextrq \$1, $T0, $t1 + # The final bytes of keystream are in $A1. + pxor $A1, $T0 + + # Copy the plaintext bytes out. +.Lopen_sse_tail_16_extract: + pextrb \$0, $T0, ($oup) + psrldq \$1, $T0 + add \$1, $oup + sub \$1, $inl + jne .Lopen_sse_tail_16_extract + + add $t0, $acc0 + adc $t1, $acc1 + adc \$1, $acc2\n"; + &poly_mul(); $code.=" + +.Lopen_sse_finalize:\n"; + &poly_add($len_store); + &poly_mul(); $code.=" + # Final reduce + mov $acc0, $t0 + mov $acc1, $t1 + mov $acc2, $t2 + sub \$-5, $acc0 + sbb \$-1, $acc1 + sbb \$3, $acc2 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + cmovc $t2, $acc2 + # Add in s part of the key + add 0+$s_store, $acc0 + adc 8+$s_store, $acc1\n"; + +$code.=" + movaps 16*0+$xmm_store, %xmm6 + movaps 16*1+$xmm_store, %xmm7 + movaps 16*2+$xmm_store, %xmm8 + movaps 16*3+$xmm_store, %xmm9 + movaps 16*4+$xmm_store, %xmm10 + movaps 16*5+$xmm_store, %xmm11 + movaps 16*6+$xmm_store, %xmm12 + movaps 16*7+$xmm_store, %xmm13 + movaps 16*8+$xmm_store, %xmm14 + movaps 16*9+$xmm_store, %xmm15\n" if ($win64); +$code.=" +.cfi_remember_state + add \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset -(288 + 32) + # The tag replaces the key on return + pop $keyp +.cfi_pop $keyp + mov $acc0, ($keyp) + mov $acc1, 8($keyp) + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp + ret +############################################################################### +.Lopen_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 + movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 + movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 + movdqu 2*16($keyp), $D0 + movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 + movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2 + movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 + mov \$10, $acc0 + +.Lopen_sse_128_rounds: \n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + dec $acc0 + jnz .Lopen_sse_128_rounds + paddd .Lchacha20_consts(%rip), $A0 + paddd .Lchacha20_consts(%rip), $A1 + paddd .Lchacha20_consts(%rip), $A2 + paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 + paddd $T2, $C1\npaddd $T2, $C2 + paddd $T3, $D1 + paddd .Lsse_inc(%rip), $T3 + paddd $T3, $D2 + # Clamp and store the key + pand .Lclamp(%rip), $A0 + movdqa $A0, $r_store + movdqa $B0, $s_store + # Hash + mov $adl, $itr2 + call poly_hash_ad_internal +.Lopen_sse_128_xor_hash: + cmp \$16, $inl + jb .Lopen_sse_tail_16 + sub \$16, $inl\n"; + # Load for hashing + &poly_add("0*8($inp)"); $code.=" + # Load for decryption + movdqu 0*16($inp), $T0 + pxor $T0, $A1 + movdqu 
$A1, 0*16($oup) + lea 1*16($inp), $inp + lea 1*16($oup), $oup\n"; + &poly_mul(); $code.=" + # Shift the stream left + movdqa $B1, $A1 + movdqa $C1, $B1 + movdqa $D1, $C1 + movdqa $A2, $D1 + movdqa $B2, $A2 + movdqa $C2, $B2 + movdqa $D2, $C2 + jmp .Lopen_sse_128_xor_hash +.size chacha20_poly1305_open_sse41, .-chacha20_poly1305_open_sse41 +.cfi_endproc + +################################################################################ +################################################################################ +# void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext, +# size_t plaintext_len, const uint8_t *ad, +# size_t ad_len, +# union chacha20_poly1305_seal_data *data); +.globl chacha20_poly1305_seal_sse41 +.type chacha20_poly1305_seal_sse41,\@function,6 +.align 64 +chacha20_poly1305_seal_sse41: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +# We write the calculated authenticator back to keyp at the end, so save +# the pointer on the stack too. + push $keyp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset 288 + 32 + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov 56($keyp), $inl # extra_in_len + addq %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + mov %rdx, $inl + + cmp \$128, $inl + jbe .Lseal_sse_128 + # For longer buffers, prepare the poly key + some stream + movdqa .Lchacha20_consts(%rip), $A0 + movdqu 0*16($keyp), $B0 + movdqu 1*16($keyp), $C0 + movdqu 2*16($keyp), $D0 + + movdqa $A0, $A1 + movdqa $A0, $A2 + movdqa $A0, $A3 + movdqa $B0, $B1 + movdqa $B0, $B2 + movdqa $B0, $B3 + movdqa $C0, $C1 + movdqa $C0, $C2 + movdqa $C0, $C3 + movdqa $D0, $D3 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $D2 + paddd .Lsse_inc(%rip), $D0 + movdqa $D0, $D1 + paddd .Lsse_inc(%rip), $D0 + # Store on stack + movdqa $B0, $state1_store + movdqa $C0, $state2_store + movdqa $D0, $ctr0_store + movdqa $D1, $ctr1_store + movdqa $D2, $ctr2_store + movdqa $D3, $ctr3_store + mov \$10, $acc0 +.Lseal_sse_init_rounds: \n"; + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + dec $acc0 + jnz .Lseal_sse_init_rounds\n"; + &finalize_state(4); $code.=" + # Clamp and store the key + pand .Lclamp(%rip), $A3 + movdqa $A3, $r_store + movdqa $B3, $s_store + # Hash + mov $adl, $itr2 + call poly_hash_ad_internal\n"; + &xor_stream($A2,$B2,$C2,$D2,"0*16"); + &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" + cmp \$12*16, $inl + ja .Lseal_sse_main_init + mov \$8*16, $itr1 + sub \$8*16, $inl + lea 8*16($inp), $inp + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_init:\n"; + &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" + mov \$12*16, $itr1 + sub \$12*16, $inl + lea 12*16($inp), $inp + mov \$2, $itr1 + mov \$8, $itr2 + cmp \$4*16, $inl + jbe .Lseal_sse_tail_64 + cmp \$8*16, $inl + jbe .Lseal_sse_tail_128 + cmp \$12*16, $inl + jbe .Lseal_sse_tail_192 + +.Lseal_sse_main_loop: \n"; + # The main loop + &prep_state(4); $code.=" +.align 32 +.Lseal_sse_main_rounds: \n"; + &emit_body(20); + &poly_add("0($oup)"); + &emit_body(20); + 
&poly_stage1(); + &emit_body(20); + &poly_stage2(); + &emit_body(20); + &poly_stage3(); + &emit_body(20); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + lea 16($oup), $oup + dec $itr2 + jge .Lseal_sse_main_rounds\n"; + &poly_add("0*8($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup + dec $itr1 + jg .Lseal_sse_main_rounds\n"; + + &finalize_state(4);$code.=" + movdqa $D2, $tmp_store\n"; + &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.=" + movdqa $tmp_store, $D2\n"; + &xor_stream($A2,$B2,$C2,$D2, 4*16); + &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" + cmp \$16*16, $inl + ja .Lseal_sse_main_loop_xor + + mov \$12*16, $itr1 + sub \$12*16, $inl + lea 12*16($inp), $inp + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_loop_xor: \n"; + &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" + lea 16*16($inp), $inp + sub \$16*16, $inl + mov \$6, $itr1 + mov \$4, $itr2 + cmp \$12*16, $inl + jg .Lseal_sse_main_loop + mov $inl, $itr1 + test $inl, $inl + je .Lseal_sse_128_tail_hash + mov \$6, $itr1 + cmp \$8*16, $inl + ja .Lseal_sse_tail_192 + cmp \$4*16, $inl + ja .Lseal_sse_tail_128 +############################################################################### +.Lseal_sse_tail_64: \n"; + &prep_state(1); $code.=" +.Lseal_sse_tail_64_rounds_and_x2hash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup +.Lseal_sse_tail_64_rounds_and_x1hash: \n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup + dec $itr1 + jg .Lseal_sse_tail_64_rounds_and_x2hash + dec $itr2 + jge .Lseal_sse_tail_64_rounds_and_x1hash\n"; + &finalize_state(1); $code.=" + jmp .Lseal_sse_128_tail_xor +############################################################################### +.Lseal_sse_tail_128:\n"; + &prep_state(2); $code.=" +.Lseal_sse_tail_128_rounds_and_x2hash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup +.Lseal_sse_tail_128_rounds_and_x1hash: \n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &poly_add("0($oup)"); + &poly_mul(); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" + lea 16($oup), $oup + dec $itr1 + jg .Lseal_sse_tail_128_rounds_and_x2hash + dec $itr2 + jge .Lseal_sse_tail_128_rounds_and_x1hash\n"; + &finalize_state(2); + &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" + mov \$4*16, $itr1 + sub \$4*16, $inl + lea 4*16($inp), $inp + jmp .Lseal_sse_128_tail_hash +############################################################################### +.Lseal_sse_tail_192:\n"; + &prep_state(3); $code.=" +.Lseal_sse_tail_192_rounds_and_x2hash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 16($oup), $oup +.Lseal_sse_tail_192_rounds_and_x1hash: \n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); + &poly_add("0($oup)"); + &poly_mul(); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + lea 16($oup), $oup + dec $itr1 + jg .Lseal_sse_tail_192_rounds_and_x2hash + dec $itr2 + jge .Lseal_sse_tail_192_rounds_and_x1hash\n"; + &finalize_state(3); + &xor_stream($A2,$B2,$C2,$D2,0*16); + &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" + mov \$8*16, $itr1 + sub \$8*16, $inl + lea 8*16($inp), $inp 
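+ # Fall through to the shared seal tail with $itr1 = 8*16, the number of
+ # ciphertext bytes just written at ($oup) that still need to be absorbed
+ # into the Poly1305 state. .Lseal_sse_128_tail_hash absorbs them 16 bytes
+ # at a time, after which .Lseal_sse_128_tail_xor encrypts whatever input
+ # remains using the leftover keystream block.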
+############################################################################### +.Lseal_sse_128_tail_hash: + cmp \$16, $itr1 + jb .Lseal_sse_128_tail_xor\n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + sub \$16, $itr1 + lea 16($oup), $oup + jmp .Lseal_sse_128_tail_hash + +.Lseal_sse_128_tail_xor: + cmp \$16, $inl + jb .Lseal_sse_tail_16 + sub \$16, $inl + # Load for decryption + movdqu 0*16($inp), $T0 + pxor $T0, $A0 + movdqu $A0, 0*16($oup) + # Then hash + add 0*8($oup), $acc0 + adc 1*8($oup), $acc1 + adc \$1, $acc2 + lea 1*16($inp), $inp + lea 1*16($oup), $oup\n"; + &poly_mul(); $code.=" + # Shift the stream left + movdqa $B0, $A0 + movdqa $C0, $B0 + movdqa $D0, $C0 + movdqa $A1, $D0 + movdqa $B1, $A1 + movdqa $C1, $B1 + movdqa $D1, $C1 + jmp .Lseal_sse_128_tail_xor + +.Lseal_sse_tail_16: + test $inl, $inl + jz .Lprocess_blocks_of_extra_in + # We can only load the PT one byte at a time to avoid buffer overread + mov $inl, $itr2 + mov $inl, $itr1 + lea -1($inp,$inl), $inp + pxor $T3, $T3 +.Lseal_sse_tail_16_compose: + pslldq \$1, $T3 + pinsrb \$0, ($inp), $T3 + lea -1($inp), $inp + dec $itr1 + jne .Lseal_sse_tail_16_compose + + # XOR the keystream with the plaintext. + pxor $A0, $T3 + + # Write ciphertext out, byte-by-byte. + movq $inl, $itr1 + movdqu $T3, $A0 +.Lseal_sse_tail_16_extract: + pextrb \$0, $A0, ($oup) + psrldq \$1, $A0 + add \$1, $oup + sub \$1, $itr1 + jnz .Lseal_sse_tail_16_extract + + # $T3 contains the final (partial, non-empty) block of ciphertext which + # needs to be fed into the Poly1305 state. The right-most $inl bytes of it + # are valid. We need to fill it with extra_in bytes until full, or until we + # run out of bytes. + # + # $keyp points to the tag output, which is actually a struct with the + # extra_in pointer and length at offset 48. + movq 288 + $xmm_storage + 32(%rsp), $keyp + movq 56($keyp), $t1 # extra_in_len + movq 48($keyp), $t0 # extra_in + test $t1, $t1 + jz .Lprocess_partial_block # Common case: no bytes of extra_in + + movq \$16, $t2 + subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3. + cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len + # (note that AT&T syntax reverses the arguments) + jge .Lload_extra_in + movq $t1, $t2 + +.Lload_extra_in: + # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load + # into $T3. They are loaded in reverse order. + leaq -1($t0,$t2), $inp + # Update extra_in and extra_in_len to reflect the bytes that are about to + # be read. + addq $t2, $t0 + subq $t2, $t1 + movq $t0, 48($keyp) + movq $t1, 56($keyp) + + # Update $itr2, which is used to select the mask later on, to reflect the + # extra bytes about to be added. + addq $t2, $itr2 + + # Load $t2 bytes of extra_in into $T2. + pxor $T2, $T2 +.Lload_extra_load_loop: + pslldq \$1, $T2 + pinsrb \$0, ($inp), $T2 + lea -1($inp), $inp + sub \$1, $t2 + jnz .Lload_extra_load_loop + + # Shift $T2 up the length of the remainder from the main encryption. Sadly, + # the shift for an XMM register has to be a constant, thus we loop to do + # this. + movq $inl, $t2 + +.Lload_extra_shift_loop: + pslldq \$1, $T2 + sub \$1, $t2 + jnz .Lload_extra_shift_loop + + # Mask $T3 (the remainder from the main encryption) so that superfluous + # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are + # disjoint and so we can merge them with an OR. + lea .Land_masks(%rip), $t2 + shl \$4, $inl + pand -16($t2,$inl), $T3 + + # Merge $T2 into $T3, forming the remainder block. 
+ por $T2, $T3 + + # The block of ciphertext + extra_in is ready to be included in the + # Poly1305 state. + movq $T3, $t0 + pextrq \$1, $T3, $t1 + add $t0, $acc0 + adc $t1, $acc1 + adc \$1, $acc2\n"; + &poly_mul(); $code.=" + +.Lprocess_blocks_of_extra_in: + # There may be additional bytes of extra_in to process. + movq 288+32+$xmm_storage (%rsp), $keyp + movq 48($keyp), $inp # extra_in + movq 56($keyp), $itr2 # extra_in_len + movq $itr2, $itr1 + shr \$4, $itr2 # number of blocks + +.Lprocess_extra_hash_loop: + jz process_extra_in_trailer\n"; + &poly_add("0($inp)"); + &poly_mul(); $code.=" + leaq 16($inp), $inp + subq \$1, $itr2 + jmp .Lprocess_extra_hash_loop +process_extra_in_trailer: + andq \$15, $itr1 # remaining num bytes (<16) of extra_in + movq $itr1, $inl + jz .Ldo_length_block + leaq -1($inp,$itr1), $inp + +.Lprocess_extra_in_trailer_load: + pslldq \$1, $T3 + pinsrb \$0, ($inp), $T3 + lea -1($inp), $inp + sub \$1, $itr1 + jnz .Lprocess_extra_in_trailer_load + +.Lprocess_partial_block: + # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0 + lea .Land_masks(%rip), $t2 + shl \$4, $inl + pand -16($t2,$inl), $T3 + movq $T3, $t0 + pextrq \$1, $T3, $t1 + add $t0, $acc0 + adc $t1, $acc1 + adc \$1, $acc2\n"; + &poly_mul(); $code.=" + +.Ldo_length_block:\n"; + &poly_add($len_store); + &poly_mul(); $code.=" + # Final reduce + mov $acc0, $t0 + mov $acc1, $t1 + mov $acc2, $t2 + sub \$-5, $acc0 + sbb \$-1, $acc1 + sbb \$3, $acc2 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + cmovc $t2, $acc2 + # Add in s part of the key + add 0+$s_store, $acc0 + adc 8+$s_store, $acc1\n"; + +$code.=" + movaps 16*0+$xmm_store, %xmm6 + movaps 16*1+$xmm_store, %xmm7 + movaps 16*2+$xmm_store, %xmm8 + movaps 16*3+$xmm_store, %xmm9 + movaps 16*4+$xmm_store, %xmm10 + movaps 16*5+$xmm_store, %xmm11 + movaps 16*6+$xmm_store, %xmm12 + movaps 16*7+$xmm_store, %xmm13 + movaps 16*8+$xmm_store, %xmm14 + movaps 16*9+$xmm_store, %xmm15\n" if ($win64); +$code.=" +.cfi_remember_state + add \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset -(288 + 32) + # The tag replaces the key on return + pop $keyp +.cfi_pop $keyp + mov $acc0, ($keyp) + mov $acc1, 8($keyp) + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp + ret +################################################################################ +.Lseal_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 + movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 + movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 + movdqu 2*16($keyp), $D2 + movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0 + movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 + movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 + mov \$10, $acc0 + +.Lseal_sse_128_rounds:\n"; + &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + dec $acc0 + jnz .Lseal_sse_128_rounds + paddd .Lchacha20_consts(%rip), $A0 + paddd .Lchacha20_consts(%rip), $A1 + paddd .Lchacha20_consts(%rip), $A2 + paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 + paddd $T2, $C0\npaddd $T2, $C1 + paddd $T3, $D0 + paddd .Lsse_inc(%rip), $T3 + paddd $T3, $D1 + # Clamp and store the key + pand .Lclamp(%rip), $A2 + movdqa $A2, $r_store + movdqa $B2, $s_store + # Hash + mov %r8, $itr2 + 
call poly_hash_ad_internal + jmp .Lseal_sse_128_tail_xor +.size chacha20_poly1305_seal_sse41, .-chacha20_poly1305_seal_sse41 +.cfi_endproc\n"; +} + +if ($avx>1) { + +($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); +my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); +($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); +$state1_store="$xmm_storage+2*32(%rbp)"; +$state2_store="$xmm_storage+3*32(%rbp)"; +$tmp_store="$xmm_storage+4*32(%rbp)"; +$ctr0_store="$xmm_storage+5*32(%rbp)"; +$ctr1_store="$xmm_storage+6*32(%rbp)"; +$ctr2_store="$xmm_storage+7*32(%rbp)"; +$ctr3_store="$xmm_storage+8*32(%rbp)"; + +sub chacha_qr_avx2 { +my ($a,$b,$c,$d,$t,$dir)=@_; +$code.=<<___ if ($dir =~ /store/); + vmovdqa $t, $tmp_store +___ +$code.=<<___; + vpaddd $b, $a, $a + vpxor $a, $d, $d + vpshufb .Lrol16(%rip), $d, $d + vpaddd $d, $c, $c + vpxor $c, $b, $b + vpsrld \$20, $b, $t + vpslld \$12, $b, $b + vpxor $t, $b, $b + vpaddd $b, $a, $a + vpxor $a, $d, $d + vpshufb .Lrol8(%rip), $d, $d + vpaddd $d, $c, $c + vpxor $c, $b, $b + vpslld \$7, $b, $t + vpsrld \$25, $b, $b + vpxor $t, $b, $b +___ +$code.=<<___ if ($dir =~ /left/); + vpalignr \$12, $d, $d, $d + vpalignr \$8, $c, $c, $c + vpalignr \$4, $b, $b, $b +___ +$code.=<<___ if ($dir =~ /right/); + vpalignr \$4, $d, $d, $d + vpalignr \$8, $c, $c, $c + vpalignr \$12, $b, $b, $b +___ +$code.=<<___ if ($dir =~ /load/); + vmovdqa $tmp_store, $t +___ +} + +sub prep_state_avx2 { +my ($n)=@_; +$code.=<<___; + vmovdqa .Lchacha20_consts(%rip), $A0 + vmovdqa $state1_store, $B0 + vmovdqa $state2_store, $C0 +___ +$code.=<<___ if ($n ge 2); + vmovdqa $A0, $A1 + vmovdqa $B0, $B1 + vmovdqa $C0, $C1 +___ +$code.=<<___ if ($n ge 3); + vmovdqa $A0, $A2 + vmovdqa $B0, $B2 + vmovdqa $C0, $C2 +___ +$code.=<<___ if ($n ge 4); + vmovdqa $A0, $A3 + vmovdqa $B0, $B3 + vmovdqa $C0, $C3 +___ +$code.=<<___ if ($n eq 1); + vmovdqa .Lavx2_inc(%rip), $D0 + vpaddd $ctr0_store, $D0, $D0 + vmovdqa $D0, $ctr0_store +___ +$code.=<<___ if ($n eq 2); + vmovdqa .Lavx2_inc(%rip), $D0 + vpaddd $ctr0_store, $D0, $D1 + vpaddd $D1, $D0, $D0 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store +___ +$code.=<<___ if ($n eq 3); + vmovdqa .Lavx2_inc(%rip), $D0 + vpaddd $ctr0_store, $D0, $D2 + vpaddd $D2, $D0, $D1 + vpaddd $D1, $D0, $D0 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store + vmovdqa $D2, $ctr2_store +___ +$code.=<<___ if ($n eq 4); + vmovdqa .Lavx2_inc(%rip), $D0 + vpaddd $ctr0_store, $D0, $D3 + vpaddd $D3, $D0, $D2 + vpaddd $D2, $D0, $D1 + vpaddd $D1, $D0, $D0 + vmovdqa $D3, $ctr3_store + vmovdqa $D2, $ctr2_store + vmovdqa $D1, $ctr1_store + vmovdqa $D0, $ctr0_store +___ +} + +sub finalize_state_avx2 { +my ($n)=@_; +$code.=<<___ if ($n eq 4); + vpaddd .Lchacha20_consts(%rip), $A3, $A3 + vpaddd $state1_store, $B3, $B3 + vpaddd $state2_store, $C3, $C3 + vpaddd $ctr3_store, $D3, $D3 +___ +$code.=<<___ if ($n ge 3); + vpaddd .Lchacha20_consts(%rip), $A2, $A2 + vpaddd $state1_store, $B2, $B2 + vpaddd $state2_store, $C2, $C2 + vpaddd $ctr2_store, $D2, $D2 +___ +$code.=<<___ if ($n ge 2); + vpaddd .Lchacha20_consts(%rip), $A1, $A1 + vpaddd $state1_store, $B1, $B1 + vpaddd $state2_store, $C1, $C1 + vpaddd $ctr1_store, $D1, $D1 +___ +$code.=<<___; + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd $state1_store, $B0, $B0 + vpaddd $state2_store, $C0, $C0 + vpaddd $ctr0_store, $D0, $D0 +___ +} + +sub xor_stream_avx2 { +my ($A, $B, $C, $D, $offset, $hlp)=@_; +$code.=<<___; + vperm2i128 \$0x02, $A, $B, $hlp + vperm2i128 
\$0x13, $A, $B, $B + vperm2i128 \$0x02, $C, $D, $A + vperm2i128 \$0x13, $C, $D, $C + vpxor 0*32+$offset($inp), $hlp, $hlp + vpxor 1*32+$offset($inp), $A, $A + vpxor 2*32+$offset($inp), $B, $B + vpxor 3*32+$offset($inp), $C, $C + vmovdqu $hlp, 0*32+$offset($oup) + vmovdqu $A, 1*32+$offset($oup) + vmovdqu $B, 2*32+$offset($oup) + vmovdqu $C, 3*32+$offset($oup) +___ +} + +sub finish_stream_avx2 { +my ($A, $B, $C, $D, $hlp)=@_; +$code.=<<___; + vperm2i128 \$0x13, $A, $B, $hlp + vperm2i128 \$0x02, $A, $B, $A + vperm2i128 \$0x02, $C, $D, $B + vperm2i128 \$0x13, $C, $D, $D + vmovdqa $hlp, $C +___ +} + +sub poly_stage1_mulx { +$code.=<<___; + mov 0+$r_store, %rdx + mov %rdx, $t2 + mulx $acc0, $t0, $t1 + mulx $acc1, %rax, %rdx + imulq $acc2, $t2 + add %rax, $t1 + adc %rdx, $t2 +___ +} + +sub poly_stage2_mulx { +$code.=<<___; + mov 8+$r_store, %rdx + mulx $acc0, $acc0, %rax + add $acc0, $t1 + mulx $acc1, $acc1, $t3 + adc $acc1, $t2 + adc \$0, $t3 + imulq $acc2, %rdx +___ +} + +sub poly_stage3_mulx { +$code.=<<___; + add %rax, $t2 + adc %rdx, $t3 +___ +} + +sub poly_mul_mulx { + &poly_stage1_mulx(); + &poly_stage2_mulx(); + &poly_stage3_mulx(); + &poly_reduce_stage(); +} + +sub gen_chacha_round_avx2 { +my ($rot1, $rot2, $shift)=@_; +my $round=""; +$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20); +$round=$round ."vmovdqa $rot2, $C0 + vpaddd $B3, $A3, $A3 + vpaddd $B2, $A2, $A2 + vpaddd $B1, $A1, $A1 + vpaddd $B0, $A0, $A0 + vpxor $A3, $D3, $D3 + vpxor $A2, $D2, $D2 + vpxor $A1, $D1, $D1 + vpxor $A0, $D0, $D0 + vpshufb $C0, $D3, $D3 + vpshufb $C0, $D2, $D2 + vpshufb $C0, $D1, $D1 + vpshufb $C0, $D0, $D0 + vpaddd $D3, $C3, $C3 + vpaddd $D2, $C2, $C2 + vpaddd $D1, $C1, $C1 + vpaddd $tmp_store, $D0, $C0 + vpxor $C3, $B3, $B3 + vpxor $C2, $B2, $B2 + vpxor $C1, $B1, $B1 + vpxor $C0, $B0, $B0 + vmovdqa $C0, $tmp_store + vpsrld \$$rot1, $B3, $C0 + vpslld \$32-$rot1, $B3, $B3 + vpxor $C0, $B3, $B3 + vpsrld \$$rot1, $B2, $C0 + vpslld \$32-$rot1, $B2, $B2 + vpxor $C0, $B2, $B2 + vpsrld \$$rot1, $B1, $C0 + vpslld \$32-$rot1, $B1, $B1 + vpxor $C0, $B1, $B1 + vpsrld \$$rot1, $B0, $C0 + vpslld \$32-$rot1, $B0, $B0 + vpxor $C0, $B0, $B0\n"; +($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); +($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); +$round=$round ."vmovdqa $tmp_store, $C0 + vpalignr \$$s1, $B3, $B3, $B3 + vpalignr \$$s2, $C3, $C3, $C3 + vpalignr \$$s3, $D3, $D3, $D3 + vpalignr \$$s1, $B2, $B2, $B2 + vpalignr \$$s2, $C2, $C2, $C2 + vpalignr \$$s3, $D2, $D2, $D2 + vpalignr \$$s1, $B1, $B1, $B1 + vpalignr \$$s2, $C1, $C1, $C1 + vpalignr \$$s3, $D1, $D1, $D1 + vpalignr \$$s1, $B0, $B0, $B0 + vpalignr \$$s2, $C0, $C0, $C0 + vpalignr \$$s3, $D0, $D0, $D0\n" +if (($shift =~ /left/) || ($shift =~ /right/)); +return $round; +}; + +$chacha_body = &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . + &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "left") . + &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . + &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "right"); + +@loop_body = split /\n/, $chacha_body; + +$code.=" +############################################################################### +.globl chacha20_poly1305_open_avx2 +.type chacha20_poly1305_open_avx2,\@function,6 +.align 64 +chacha20_poly1305_open_avx2: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + # We write the calculated authenticator back to keyp at the end, so save + # the pointer on the stack too. 
+ push $keyp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset 288 + 32 + + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + + vzeroupper + vmovdqa .Lchacha20_consts(%rip), $A0 + vbroadcasti128 0*16($keyp), $B0 + vbroadcasti128 1*16($keyp), $C0 + vbroadcasti128 2*16($keyp), $D0 + vpaddd .Lavx2_init(%rip), $D0, $D0 + cmp \$6*32, $inl + jbe .Lopen_avx2_192 + cmp \$10*32, $inl + jbe .Lopen_avx2_320 + + vmovdqa $B0, $state1_store + vmovdqa $C0, $state2_store + vmovdqa $D0, $ctr0_store + mov \$10, $acc0 +.Lopen_avx2_init_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + dec $acc0 + jne .Lopen_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd $state1_store, $B0, $B0 + vpaddd $state2_store, $C0, $C0 + vpaddd $ctr0_store, $D0, $D0 + + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for the first 64 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + # Hash AD + first 64 bytes + mov $adl, $itr2 + call poly_hash_ad_internal + # Hash first 64 bytes + xor $itr1, $itr1 +.Lopen_avx2_init_hash: \n"; + &poly_add("0($inp,$itr1)"); + &poly_mul(); $code.=" + add \$16, $itr1 + cmp \$2*32, $itr1 + jne .Lopen_avx2_init_hash + # Decrypt first 64 bytes + vpxor 0*32($inp), $A0, $A0 + vpxor 1*32($inp), $B0, $B0 + # Store first 64 bytes of decrypted data + vmovdqu $A0, 0*32($oup) + vmovdqu $B0, 1*32($oup) + lea 2*32($inp), $inp + lea 2*32($oup), $oup + sub \$2*32, $inl +.Lopen_avx2_main_loop: + # Hash and decrypt 512 bytes each iteration + cmp \$16*32, $inl + jb .Lopen_avx2_main_loop_done\n"; + &prep_state_avx2(4); $code.=" + xor $itr1, $itr1 +.Lopen_avx2_main_loop_rounds: \n"; + &poly_add("0*8($inp,$itr1)"); + &emit_body(10); + &poly_stage1_mulx(); + &emit_body(9); + &poly_stage2_mulx(); + &emit_body(12); + &poly_stage3_mulx(); + &emit_body(10); + &poly_reduce_stage(); + &emit_body(9); + &poly_add("2*8($inp,$itr1)"); + &emit_body(8); + &poly_stage1_mulx(); + &emit_body(18); + &poly_stage2_mulx(); + &emit_body(18); + &poly_stage3_mulx(); + &emit_body(9); + &poly_reduce_stage(); + &emit_body(8); + &poly_add("4*8($inp,$itr1)"); $code.=" + lea 6*8($itr1), $itr1\n"; + &emit_body(18); + &poly_stage1_mulx(); + &emit_body(8); + &poly_stage2_mulx(); + &emit_body(8); + &poly_stage3_mulx(); + &emit_body(18); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + cmp \$10*6*8, $itr1 + jne .Lopen_avx2_main_loop_rounds\n"; + &finalize_state_avx2(4); $code.=" + vmovdqa $A0, $tmp_store\n"; + &poly_add("10*6*8($inp)"); + &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" + vmovdqa $tmp_store, $A0\n"; + &poly_mul(); + &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); + &poly_add("10*6*8+2*8($inp)"); + &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); + &poly_mul(); + &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" + lea 16*32($inp), $inp + lea 16*32($oup), $oup + sub \$16*32, $inl + jmp .Lopen_avx2_main_loop 
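+ # Each pass through the loop above consumes 16*32 = 512 bytes: the rounds
+ # loop runs ten ChaCha double-rounds over four two-block ymm states while
+ # absorbing three 16-byte Poly1305 blocks per double-round (30 blocks),
+ # and the final two blocks are absorbed while the keystream is XORed in.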
+.Lopen_avx2_main_loop_done: + test $inl, $inl + vzeroupper + je .Lopen_sse_finalize + + cmp \$12*32, $inl + ja .Lopen_avx2_tail_512 + cmp \$8*32, $inl + ja .Lopen_avx2_tail_384 + cmp \$4*32, $inl + ja .Lopen_avx2_tail_256\n"; +############################################################################### + # 1-128 bytes left + &prep_state_avx2(1); $code.=" + xor $itr2, $itr2 + mov $inl, $itr1 + and \$-16, $itr1 + test $itr1, $itr1 + je .Lopen_avx2_tail_128_rounds # Have nothing to hash +.Lopen_avx2_tail_128_rounds_and_x1hash: \n"; + &poly_add("0*8($inp,$itr2)"); + &poly_mul(); $code.=" +.Lopen_avx2_tail_128_rounds: + add \$16, $itr2\n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + cmp $itr1, $itr2 + jb .Lopen_avx2_tail_128_rounds_and_x1hash + cmp \$160, $itr2 + jne .Lopen_avx2_tail_128_rounds\n"; + &finalize_state_avx2(1); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + jmp .Lopen_avx2_tail_128_xor +############################################################################### +.Lopen_avx2_tail_256: \n"; + # 129-256 bytes left + &prep_state_avx2(2); $code.=" + mov $inl, $tmp_store + mov $inl, $itr1 + sub \$4*32, $itr1 + shr \$4, $itr1 + mov \$10, $itr2 + cmp \$10, $itr1 + cmovg $itr2, $itr1 + mov $inp, $inl + xor $itr2, $itr2 +.Lopen_avx2_tail_256_rounds_and_x1hash: \n"; + &poly_add("0*8($inl)"); + &poly_mul_mulx(); $code.=" + lea 16($inl), $inl +.Lopen_avx2_tail_256_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" + inc $itr2\n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + cmp $itr1, $itr2 + jb .Lopen_avx2_tail_256_rounds_and_x1hash + cmp \$10, $itr2 + jne .Lopen_avx2_tail_256_rounds + mov $inl, $itr2 + sub $inp, $inl + mov $inl, $itr1 + mov $tmp_store, $inl +.Lopen_avx2_tail_256_hash: + add \$16, $itr1 + cmp $inl, $itr1 + jg .Lopen_avx2_tail_256_done\n"; + &poly_add("0*8($itr2)"); + &poly_mul_mulx(); $code.=" + lea 16($itr2), $itr2 + jmp .Lopen_avx2_tail_256_hash +.Lopen_avx2_tail_256_done: \n"; + &finalize_state_avx2(2); + &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); + &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" + lea 4*32($inp), $inp + lea 4*32($oup), $oup + sub \$4*32, $inl + jmp .Lopen_avx2_tail_128_xor +############################################################################### +.Lopen_avx2_tail_384: \n"; + # 257-383 bytes left + &prep_state_avx2(3); $code.=" + mov $inl, $tmp_store + mov $inl, $itr1 + sub \$8*32, $itr1 + shr \$4, $itr1 + add \$6, $itr1 + mov \$10, $itr2 + cmp \$10, $itr1 + cmovg $itr2, $itr1 + mov $inp, $inl + xor $itr2, $itr2 +.Lopen_avx2_tail_384_rounds_and_x2hash: \n"; + &poly_add("0*8($inl)"); + &poly_mul_mulx(); $code.=" + lea 16($inl), $inl +.Lopen_avx2_tail_384_rounds_and_x1hash: \n"; + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &poly_add("0*8($inl)"); + &poly_mul(); $code.=" + lea 16($inl), $inl + inc $itr2\n"; + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" + cmp $itr1, $itr2 + jb .Lopen_avx2_tail_384_rounds_and_x2hash + cmp \$10, $itr2 + jne .Lopen_avx2_tail_384_rounds_and_x1hash + mov $inl, $itr2 + sub $inp, $inl + mov $inl, $itr1 + mov $tmp_store, $inl +.Lopen_avx2_384_tail_hash: + add \$16, 
$itr1 + cmp $inl, $itr1 + jg .Lopen_avx2_384_tail_done\n"; + &poly_add("0*8($itr2)"); + &poly_mul_mulx(); $code.=" + lea 16($itr2), $itr2 + jmp .Lopen_avx2_384_tail_hash +.Lopen_avx2_384_tail_done: \n"; + &finalize_state_avx2(3); + &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); + &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); + &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" + lea 8*32($inp), $inp + lea 8*32($oup), $oup + sub \$8*32, $inl + jmp .Lopen_avx2_tail_128_xor +############################################################################### +.Lopen_avx2_tail_512: \n"; + # 384-512 bytes left + &prep_state_avx2(4); $code.=" + xor $itr1, $itr1 + mov $inp, $itr2 +.Lopen_avx2_tail_512_rounds_and_x2hash: \n"; + &poly_add("0*8($itr2)"); + &poly_mul(); $code.=" + lea 2*8($itr2), $itr2 +.Lopen_avx2_tail_512_rounds_and_x1hash: \n"; + &emit_body(37); + &poly_add("0*8($itr2)"); + &poly_mul_mulx(); + &emit_body(48); + &poly_add("2*8($itr2)"); + &poly_mul_mulx(); $code.=" + lea 4*8($itr2), $itr2\n"; + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + inc $itr1 + cmp \$4, $itr1 + jl .Lopen_avx2_tail_512_rounds_and_x2hash + cmp \$10, $itr1 + jne .Lopen_avx2_tail_512_rounds_and_x1hash + mov $inl, $itr1 + sub \$12*32, $itr1 + and \$-16, $itr1 +.Lopen_avx2_tail_512_hash: + test $itr1, $itr1 + je .Lopen_avx2_tail_512_done\n"; + &poly_add("0*8($itr2)"); + &poly_mul_mulx(); $code.=" + lea 2*8($itr2), $itr2 + sub \$2*8, $itr1 + jmp .Lopen_avx2_tail_512_hash +.Lopen_avx2_tail_512_done: \n"; + &finalize_state_avx2(4); $code.=" + vmovdqa $A0, $tmp_store\n"; + &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" + vmovdqa $tmp_store, $A0\n"; + &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); + &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); + &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.=" + lea 12*32($inp), $inp + lea 12*32($oup), $oup + sub \$12*32, $inl +.Lopen_avx2_tail_128_xor: + cmp \$32, $inl + jb .Lopen_avx2_tail_32_xor + sub \$32, $inl + vpxor ($inp), $A0, $A0 + vmovdqu $A0, ($oup) + lea 1*32($inp), $inp + lea 1*32($oup), $oup + vmovdqa $B0, $A0 + vmovdqa $C0, $B0 + vmovdqa $D0, $C0 + jmp .Lopen_avx2_tail_128_xor +.Lopen_avx2_tail_32_xor: + cmp \$16, $inl + vmovdqa $A0x, $A1x + jb .Lopen_avx2_exit + sub \$16, $inl + #load for decryption + vpxor ($inp), $A0x, $A1x + vmovdqu $A1x, ($oup) + lea 1*16($inp), $inp + lea 1*16($oup), $oup + vperm2i128 \$0x11, $A0, $A0, $A0 + vmovdqa $A0x, $A1x +.Lopen_avx2_exit: + vzeroupper + jmp .Lopen_sse_tail_16 +############################################################################### +.Lopen_avx2_192: + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vmovdqa $D0, $T2 + vmovdqa $D1, $T3 + mov \$10, $acc0 +.Lopen_avx2_192_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" + dec $acc0 + jne .Lopen_avx2_192_rounds + vpaddd $A2, $A0, $A0 + vpaddd $A2, $A1, $A1 + vpaddd $B2, $B0, $B0 + vpaddd $B2, $B1, $B1 + vpaddd $C2, $C0, $C0 + vpaddd $C2, $C1, $C1 + vpaddd $T2, $D0, $D0 + vpaddd $T3, $D1, $D1 + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store the key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for up to 192 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + vperm2i128 \$0x02, $A1, $B1, 
$C0 + vperm2i128 \$0x02, $C1, $D1, $D0 + vperm2i128 \$0x13, $A1, $B1, $A1 + vperm2i128 \$0x13, $C1, $D1, $B1 +.Lopen_avx2_short: + mov $adl, $itr2 + call poly_hash_ad_internal +.Lopen_avx2_short_hash_and_xor_loop: + cmp \$32, $inl + jb .Lopen_avx2_short_tail_32 + sub \$32, $inl\n"; + # Load + hash + &poly_add("0*8($inp)"); + &poly_mul(); + &poly_add("2*8($inp)"); + &poly_mul(); $code.=" + # Load + decrypt + vpxor ($inp), $A0, $A0 + vmovdqu $A0, ($oup) + lea 1*32($inp), $inp + lea 1*32($oup), $oup + # Shift stream + vmovdqa $B0, $A0 + vmovdqa $C0, $B0 + vmovdqa $D0, $C0 + vmovdqa $A1, $D0 + vmovdqa $B1, $A1 + vmovdqa $C1, $B1 + vmovdqa $D1, $C1 + vmovdqa $A2, $D1 + vmovdqa $B2, $A2 + jmp .Lopen_avx2_short_hash_and_xor_loop +.Lopen_avx2_short_tail_32: + cmp \$16, $inl + vmovdqa $A0x, $A1x + jb .Lopen_avx2_short_tail_32_exit + sub \$16, $inl\n"; + &poly_add("0*8($inp)"); + &poly_mul(); $code.=" + vpxor ($inp), $A0x, $A3x + vmovdqu $A3x, ($oup) + lea 1*16($inp), $inp + lea 1*16($oup), $oup + vextracti128 \$1, $A0, $A1x +.Lopen_avx2_short_tail_32_exit: + vzeroupper + jmp .Lopen_sse_tail_16 +############################################################################### +.Lopen_avx2_320: + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D2 + vmovdqa $B0, $T1 + vmovdqa $C0, $T2 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store + vmovdqa $D2, $ctr2_store + mov \$10, $acc0 +.Lopen_avx2_320_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + dec $acc0 + jne .Lopen_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd .Lchacha20_consts(%rip), $A1, $A1 + vpaddd .Lchacha20_consts(%rip), $A2, $A2 + vpaddd $T1, $B0, $B0 + vpaddd $T1, $B1, $B1 + vpaddd $T1, $B2, $B2 + vpaddd $T2, $C0, $C0 + vpaddd $T2, $C1, $C1 + vpaddd $T2, $C2, $C2 + vpaddd $ctr0_store, $D0, $D0 + vpaddd $ctr1_store, $D1, $D1 + vpaddd $ctr2_store, $D2, $D2 + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store the key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for up to 320 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + vperm2i128 \$0x02, $A1, $B1, $C0 + vperm2i128 \$0x02, $C1, $D1, $D0 + vperm2i128 \$0x13, $A1, $B1, $A1 + vperm2i128 \$0x13, $C1, $D1, $B1 + vperm2i128 \$0x02, $A2, $B2, $C1 + vperm2i128 \$0x02, $C2, $D2, $D1 + vperm2i128 \$0x13, $A2, $B2, $A2 + vperm2i128 \$0x13, $C2, $D2, $B2 + jmp .Lopen_avx2_short +.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 +.cfi_endproc +############################################################################### +############################################################################### +.globl chacha20_poly1305_seal_avx2 +.type chacha20_poly1305_seal_avx2,\@function,6 +.align 64 +chacha20_poly1305_seal_avx2: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +# We write the calculated authenticator back to keyp at the end, so save +# the pointer on the stack too. 
+ push $keyp +.cfi_push $keyp + sub \$288 + $xmm_storage + 32, %rsp +.cfi_adjust_cfa_offset 288 + 32 + lea 32(%rsp), %rbp + and \$-32, %rbp\n"; +$code.=" + movaps %xmm6,16*0+$xmm_store + movaps %xmm7,16*1+$xmm_store + movaps %xmm8,16*2+$xmm_store + movaps %xmm9,16*3+$xmm_store + movaps %xmm10,16*4+$xmm_store + movaps %xmm11,16*5+$xmm_store + movaps %xmm12,16*6+$xmm_store + movaps %xmm13,16*7+$xmm_store + movaps %xmm14,16*8+$xmm_store + movaps %xmm15,16*9+$xmm_store\n" if ($win64); +$code.=" + mov 56($keyp), $inl # extra_in_len + addq %rdx, $inl + mov $adl, 0+$len_store + mov $inl, 8+$len_store + mov %rdx, $inl + + vzeroupper + vmovdqa .Lchacha20_consts(%rip), $A0 + vbroadcasti128 0*16($keyp), $B0 + vbroadcasti128 1*16($keyp), $C0 + vbroadcasti128 2*16($keyp), $D0 + vpaddd .Lavx2_init(%rip), $D0, $D0 + cmp \$6*32, $inl + jbe .Lseal_avx2_192 + cmp \$10*32, $inl + jbe .Lseal_avx2_320 + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $A0, $A3 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $B0, $B3 + vmovdqa $B0, $state1_store + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vmovdqa $C0, $C3 + vmovdqa $C0, $state2_store + vmovdqa $D0, $D3 + vpaddd .Lavx2_inc(%rip), $D3, $D2 + vpaddd .Lavx2_inc(%rip), $D2, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D0 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store + vmovdqa $D2, $ctr2_store + vmovdqa $D3, $ctr3_store + mov \$10, $acc0 +.Lseal_avx2_init_rounds: \n"; + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + dec $acc0 + jnz .Lseal_avx2_init_rounds\n"; + &finalize_state_avx2(4); $code.=" + vperm2i128 \$0x13, $C3, $D3, $C3 + vperm2i128 \$0x02, $A3, $B3, $D3 + vperm2i128 \$0x13, $A3, $B3, $A3 + vpand .Lclamp(%rip), $D3, $D3 + vmovdqa $D3, $r_store + mov $adl, $itr2 + call poly_hash_ad_internal + # Safely store 320 bytes (otherwise would handle with optimized call) + vpxor 0*32($inp), $A3, $A3 + vpxor 1*32($inp), $C3, $C3 + vmovdqu $A3, 0*32($oup) + vmovdqu $C3, 1*32($oup)\n"; + &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3); + &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.=" + lea 10*32($inp), $inp + sub \$10*32, $inl + mov \$10*32, $itr1 + cmp \$4*32, $inl + jbe .Lseal_avx2_short_hash_remainder + vpxor 0*32($inp), $A0, $A0 + vpxor 1*32($inp), $B0, $B0 + vpxor 2*32($inp), $C0, $C0 + vpxor 3*32($inp), $D0, $D0 + vmovdqu $A0, 10*32($oup) + vmovdqu $B0, 11*32($oup) + vmovdqu $C0, 12*32($oup) + vmovdqu $D0, 13*32($oup) + lea 4*32($inp), $inp + sub \$4*32, $inl + mov \$8, $itr1 + mov \$2, $itr2 + cmp \$4*32, $inl + jbe .Lseal_avx2_tail_128 + cmp \$8*32, $inl + jbe .Lseal_avx2_tail_256 + cmp \$12*32, $inl + jbe .Lseal_avx2_tail_384 + cmp \$16*32, $inl + jbe .Lseal_avx2_tail_512\n"; + # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop + &prep_state_avx2(4); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; + &emit_body(41); + @loop_body = split /\n/, $chacha_body; $code.=" + sub \$16, $oup + mov \$9, $itr1 + jmp .Lseal_avx2_main_loop_rounds_entry +.align 32 +.Lseal_avx2_main_loop: \n"; + &prep_state_avx2(4); $code.=" + mov \$10, $itr1 +.align 32 +.Lseal_avx2_main_loop_rounds: \n"; + &poly_add("0*8($oup)"); + &emit_body(10); + &poly_stage1_mulx(); + &emit_body(9); + &poly_stage2_mulx(); + &emit_body(12); + &poly_stage3_mulx(); + &emit_body(10); + &poly_reduce_stage(); $code.=" +.Lseal_avx2_main_loop_rounds_entry: \n"; + &emit_body(9); + &poly_add("2*8($oup)"); + &emit_body(8); + 
&poly_stage1_mulx(); + &emit_body(18); + &poly_stage2_mulx(); + &emit_body(18); + &poly_stage3_mulx(); + &emit_body(9); + &poly_reduce_stage(); + &emit_body(8); + &poly_add("4*8($oup)"); $code.=" + lea 6*8($oup), $oup\n"; + &emit_body(18); + &poly_stage1_mulx(); + &emit_body(8); + &poly_stage2_mulx(); + &emit_body(8); + &poly_stage3_mulx(); + &emit_body(18); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + dec $itr1 + jne .Lseal_avx2_main_loop_rounds\n"; + &finalize_state_avx2(4); $code.=" + vmovdqa $A0, $tmp_store\n"; + &poly_add("0*8($oup)"); + &poly_mul_mulx(); + &poly_add("2*8($oup)"); + &poly_mul_mulx(); $code.=" + lea 4*8($oup), $oup\n"; + &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" + vmovdqa $tmp_store, $A0\n"; + &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); + &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); + &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" + lea 16*32($inp), $inp + sub \$16*32, $inl + cmp \$16*32, $inl + jg .Lseal_avx2_main_loop +\n"; + &poly_add("0*8($oup)"); + &poly_mul_mulx(); + &poly_add("2*8($oup)"); + &poly_mul_mulx(); $code.=" + lea 4*8($oup), $oup + mov \$10, $itr1 + xor $itr2, $itr2 + + cmp \$12*32, $inl + ja .Lseal_avx2_tail_512 + cmp \$8*32, $inl + ja .Lseal_avx2_tail_384 + cmp \$4*32, $inl + ja .Lseal_avx2_tail_256 +############################################################################### +.Lseal_avx2_tail_128:\n"; + &prep_state_avx2(1); $code.=" +.Lseal_avx2_tail_128_rounds_and_3xhash: \n"; + &poly_add("0($oup)"); + &poly_mul_mulx(); $code.=" + lea 2*8($oup), $oup +.Lseal_avx2_tail_128_rounds_and_2xhash: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &poly_add("0*8($oup)"); + &poly_mul_mulx(); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &poly_add("2*8($oup)"); + &poly_mul_mulx(); $code.=" + lea 4*8($oup), $oup + dec $itr1 + jg .Lseal_avx2_tail_128_rounds_and_3xhash + dec $itr2 + jge .Lseal_avx2_tail_128_rounds_and_2xhash\n"; + &finalize_state_avx2(1); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + jmp .Lseal_avx2_short_loop +############################################################################### +.Lseal_avx2_tail_256:\n"; + &prep_state_avx2(2); $code.=" +.Lseal_avx2_tail_256_rounds_and_3xhash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 2*8($oup), $oup +.Lseal_avx2_tail_256_rounds_and_2xhash: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &poly_add("0*8($oup)"); + &poly_mul(); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &poly_add("2*8($oup)"); + &poly_mul(); $code.=" + lea 4*8($oup), $oup + dec $itr1 + jg .Lseal_avx2_tail_256_rounds_and_3xhash + dec $itr2 + jge .Lseal_avx2_tail_256_rounds_and_2xhash\n"; + &finalize_state_avx2(2); + &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + mov \$4*32, $itr1 + lea 4*32($inp), $inp + sub \$4*32, $inl + jmp .Lseal_avx2_short_hash_remainder +############################################################################### +.Lseal_avx2_tail_384:\n"; + &prep_state_avx2(3); $code.=" +.Lseal_avx2_tail_384_rounds_and_3xhash: \n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + lea 2*8($oup), $oup +.Lseal_avx2_tail_384_rounds_and_2xhash: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &poly_add("0*8($oup)"); + &poly_mul(); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &poly_add("2*8($oup)"); + &poly_mul(); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + lea 4*8($oup), $oup + dec $itr1 + jg .Lseal_avx2_tail_384_rounds_and_3xhash + dec $itr2 + jge .Lseal_avx2_tail_384_rounds_and_2xhash\n"; + &finalize_state_avx2(3); + &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0); + &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + mov \$8*32, $itr1 + lea 8*32($inp), $inp + sub \$8*32, $inl + jmp .Lseal_avx2_short_hash_remainder +############################################################################### +.Lseal_avx2_tail_512:\n"; + &prep_state_avx2(4); $code.=" +.Lseal_avx2_tail_512_rounds_and_3xhash: \n"; + &poly_add("0($oup)"); + &poly_mul_mulx(); $code.=" + lea 2*8($oup), $oup +.Lseal_avx2_tail_512_rounds_and_2xhash: \n"; + &emit_body(20); + &poly_add("0*8($oup)"); + &emit_body(20); + &poly_stage1_mulx(); + &emit_body(20); + &poly_stage2_mulx(); + &emit_body(20); + &poly_stage3_mulx(); + &emit_body(20); + &poly_reduce_stage(); + &emit_body(20); + &poly_add("2*8($oup)"); + &emit_body(20); + &poly_stage1_mulx(); + &emit_body(20); + &poly_stage2_mulx(); + &emit_body(20); + &poly_stage3_mulx(); + &emit_body(20); + &poly_reduce_stage(); + foreach $l (@loop_body) {$code.=$l."\n";} + @loop_body = split /\n/, $chacha_body; $code.=" + lea 4*8($oup), $oup + dec $itr1 + jg .Lseal_avx2_tail_512_rounds_and_3xhash + dec $itr2 + jge .Lseal_avx2_tail_512_rounds_and_2xhash\n"; + &finalize_state_avx2(4); $code.=" + vmovdqa $A0, $tmp_store\n"; + &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" + vmovdqa $tmp_store, $A0\n"; + &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); + &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); + &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" + mov \$12*32, $itr1 + lea 12*32($inp), $inp + sub \$12*32, $inl + jmp .Lseal_avx2_short_hash_remainder +################################################################################ +.Lseal_avx2_320: + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vpaddd .Lavx2_inc(%rip), $D1, $D2 + vmovdqa $B0, $T1 + vmovdqa $C0, $T2 + vmovdqa $D0, $ctr0_store + vmovdqa $D1, $ctr1_store + vmovdqa $D2, $ctr2_store + mov \$10, $acc0 +.Lseal_avx2_320_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); + &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" + dec $acc0 + jne .Lseal_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip), $A0, $A0 + vpaddd .Lchacha20_consts(%rip), $A1, $A1 + vpaddd .Lchacha20_consts(%rip), $A2, $A2 + vpaddd $T1, $B0, $B0 + vpaddd $T1, $B1, $B1 + vpaddd $T1, $B2, $B2 + vpaddd $T2, $C0, $C0 + vpaddd $T2, $C1, $C1 + vpaddd $T2, $C2, $C2 + vpaddd $ctr0_store, $D0, $D0 + vpaddd $ctr1_store, $D1, $D1 + vpaddd $ctr2_store, $D2, $D2 + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store the key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for up to 320 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + vperm2i128 \$0x02, $A1, $B1, $C0 + vperm2i128 \$0x02, $C1, $D1, $D0 + vperm2i128 \$0x13, $A1, $B1, $A1 + vperm2i128 \$0x13, $C1, $D1, $B1 + vperm2i128 \$0x02, $A2, $B2, $C1 + vperm2i128 \$0x02, $C2, $D2, 
$D1 + vperm2i128 \$0x13, $A2, $B2, $A2 + vperm2i128 \$0x13, $C2, $D2, $B2 + jmp .Lseal_avx2_short +################################################################################ +.Lseal_avx2_192: + vmovdqa $A0, $A1 + vmovdqa $A0, $A2 + vmovdqa $B0, $B1 + vmovdqa $B0, $B2 + vmovdqa $C0, $C1 + vmovdqa $C0, $C2 + vpaddd .Lavx2_inc(%rip), $D0, $D1 + vmovdqa $D0, $T2 + vmovdqa $D1, $T3 + mov \$10, $acc0 +.Lseal_avx2_192_rounds: \n"; + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); + &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); + &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" + dec $acc0 + jne .Lseal_avx2_192_rounds + vpaddd $A2, $A0, $A0 + vpaddd $A2, $A1, $A1 + vpaddd $B2, $B0, $B0 + vpaddd $B2, $B1, $B1 + vpaddd $C2, $C0, $C0 + vpaddd $C2, $C1, $C1 + vpaddd $T2, $D0, $D0 + vpaddd $T3, $D1, $D1 + vperm2i128 \$0x02, $A0, $B0, $T0 + # Clamp and store the key + vpand .Lclamp(%rip), $T0, $T0 + vmovdqa $T0, $r_store + # Stream for up to 192 bytes + vperm2i128 \$0x13, $A0, $B0, $A0 + vperm2i128 \$0x13, $C0, $D0, $B0 + vperm2i128 \$0x02, $A1, $B1, $C0 + vperm2i128 \$0x02, $C1, $D1, $D0 + vperm2i128 \$0x13, $A1, $B1, $A1 + vperm2i128 \$0x13, $C1, $D1, $B1 +.Lseal_avx2_short: + mov $adl, $itr2 + call poly_hash_ad_internal + xor $itr1, $itr1 +.Lseal_avx2_short_hash_remainder: + cmp \$16, $itr1 + jb .Lseal_avx2_short_loop\n"; + &poly_add("0($oup)"); + &poly_mul(); $code.=" + sub \$16, $itr1 + add \$16, $oup + jmp .Lseal_avx2_short_hash_remainder +.Lseal_avx2_short_loop: + cmp \$32, $inl + jb .Lseal_avx2_short_tail + sub \$32, $inl + # Encrypt + vpxor ($inp), $A0, $A0 + vmovdqu $A0, ($oup) + lea 1*32($inp), $inp + # Load + hash\n"; + &poly_add("0*8($oup)"); + &poly_mul(); + &poly_add("2*8($oup)"); + &poly_mul(); $code.=" + lea 1*32($oup), $oup + # Shift stream + vmovdqa $B0, $A0 + vmovdqa $C0, $B0 + vmovdqa $D0, $C0 + vmovdqa $A1, $D0 + vmovdqa $B1, $A1 + vmovdqa $C1, $B1 + vmovdqa $D1, $C1 + vmovdqa $A2, $D1 + vmovdqa $B2, $A2 + jmp .Lseal_avx2_short_loop +.Lseal_avx2_short_tail: + cmp \$16, $inl + jb .Lseal_avx2_exit + sub \$16, $inl + vpxor ($inp), $A0x, $A3x + vmovdqu $A3x, ($oup) + lea 1*16($inp), $inp\n"; + &poly_add("0*8($oup)"); + &poly_mul(); $code.=" + lea 1*16($oup), $oup + vextracti128 \$1, $A0, $A0x +.Lseal_avx2_exit: + vzeroupper + jmp .Lseal_sse_tail_16 +.cfi_endproc +.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 +"; +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/constant_time_test.c b/ring-0.17.14/crypto/constant_time_test.c new file mode 100644 index 0000000000..ed079f6a23 --- /dev/null +++ b/ring-0.17.14/crypto/constant_time_test.c @@ -0,0 +1,110 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
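The constant_time_test.c file added here exercises ring's branch-free word helpers (constant_time_is_zero_w, constant_time_eq_w, constant_time_select_w). As a rough standalone sketch of the mask-based idiom those helpers rely on (the names and definitions below are illustrative, not the ones from ring's internal.h):

#include <stdint.h>
#include <stdio.h>

typedef uintptr_t word_t;                      /* stand-in for crypto_word_t */
#define WORD_BITS (sizeof(word_t) * 8)

/* All-ones if the most significant bit of w is set, all-zeros otherwise. */
static word_t ct_msb(word_t w) { return (word_t)0 - (w >> (WORD_BITS - 1)); }

/* All-ones when a == 0, all-zeros otherwise, with no data-dependent branch. */
static word_t ct_is_zero(word_t a) { return ct_msb(~a & (a - (word_t)1)); }

/* All-ones when a == b, all-zeros otherwise. */
static word_t ct_eq(word_t a, word_t b) { return ct_is_zero(a ^ b); }

/* Returns a when mask is all-ones, b when mask is all-zeros. */
static word_t ct_select(word_t mask, word_t a, word_t b) {
  return (mask & a) | (~mask & b);
}

int main(void) {
  word_t a = 12345, b = 67890;
  printf("eq(a,a) selects a: %d\n", ct_select(ct_eq(a, a), a, b) == a);
  printf("eq(a,b) is false:  %d\n", ct_eq(a, b) == (word_t)0);
  return 0;
}

The tests that follow drive the real helpers across edge values (0, 1, SIZE_MAX/2, SIZE_MAX, ...) and count any result that is not exactly the all-ones or all-zeros mask as a failure.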
+ +#include "internal.h" + +int bssl_constant_time_test_main(void); + +static int test_binary_op_w(crypto_word_t (*op)(crypto_word_t a, crypto_word_t b), + crypto_word_t a, crypto_word_t b, int is_true) { + crypto_word_t c = op(a, b); + if (is_true && c != CONSTTIME_TRUE_W) { + return 1; + } else if (!is_true && c != CONSTTIME_FALSE_W) { + return 1; + } + return 0; +} + +static int test_is_zero_w(crypto_word_t a) { + crypto_word_t c = constant_time_is_zero_w(a); + if (a == 0 && c != CONSTTIME_TRUE_W) { + return 1; + } else if (a != 0 && c != CONSTTIME_FALSE_W) { + return 1; + } + + c = constant_time_is_nonzero_w(a); + if (a == 0 && c != CONSTTIME_FALSE_W) { + return 1; + } else if (a != 0 && c != CONSTTIME_TRUE_W) { + return 1; + } + + return 0; +} + +static int test_select_w(crypto_word_t a, crypto_word_t b) { + crypto_word_t selected = constant_time_select_w(CONSTTIME_TRUE_W, a, b); + if (selected != a) { + return 1; + } + selected = constant_time_select_w(CONSTTIME_FALSE_W, a, b); + if (selected != b) { + return 1; + } + return 0; +} + +static crypto_word_t test_values_w[] = { + 0, + 1, + 1024, + 12345, + 32000, +#if defined(OPENSSL_64_BIT) + 0xffffffff / 2 - 1, + 0xffffffff / 2, + 0xffffffff / 2 + 1, + 0xffffffff - 1, + 0xffffffff, +#endif + SIZE_MAX / 2 - 1, + SIZE_MAX / 2, + SIZE_MAX / 2 + 1, + SIZE_MAX - 1, + SIZE_MAX +}; + +int bssl_constant_time_test_main(void) { + int num_failed = 0; + + for (size_t i = 0; + i < sizeof(test_values_w) / sizeof(test_values_w[0]); ++i) { + crypto_word_t a = test_values_w[i]; + num_failed += test_is_zero_w(a); + for (size_t j = 0; + j < sizeof(test_values_w) / sizeof(test_values_w[0]); ++j) { + crypto_word_t b = test_values_w[j]; + num_failed += test_binary_op_w(&constant_time_eq_w, a, b, a == b); + num_failed += test_binary_op_w(&constant_time_eq_w, b, a, b == a); + num_failed += test_select_w(a, b); + } + } + + return num_failed == 0; +} + +// Exposes `constant_time_conditional_memcpy` to Rust for tests only. +void bssl_constant_time_test_conditional_memcpy(uint8_t dst[256], const uint8_t src[256], + crypto_word_t b) { + constant_time_conditional_memcpy(dst, src, 256, b); + } + +// Exposes `constant_time_conditional_memxor` to Rust for tests only. +void bssl_constant_time_test_conditional_memxor(uint8_t dst[256], + const uint8_t src[256], + crypto_word_t b) { + constant_time_conditional_memxor(dst, src, 256, b); +} diff --git a/ring-0.17.14/crypto/cpu_intel.c b/ring-0.17.14/crypto/cpu_intel.c new file mode 100644 index 0000000000..6e792b6ba4 --- /dev/null +++ b/ring-0.17.14/crypto/cpu_intel.c @@ -0,0 +1,198 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + + +#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push, 3) +#include +#include +#pragma warning(pop) +#endif + +#include "internal.h" + + +// OPENSSL_cpuid runs the cpuid instruction. 
|leaf| is passed in as EAX and ECX +// is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through +// |*out_edx|. +static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx, + uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) { +#if defined(_MSC_VER) && !defined(__clang__) + int tmp[4]; + __cpuid(tmp, (int)leaf); + *out_eax = (uint32_t)tmp[0]; + *out_ebx = (uint32_t)tmp[1]; + *out_ecx = (uint32_t)tmp[2]; + *out_edx = (uint32_t)tmp[3]; +#elif defined(__pic__) && defined(OPENSSL_32_BIT) + // Inline assembly may not clobber the PIC register. For 32-bit, this is EBX. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. + __asm__ volatile ( + "xor %%ecx, %%ecx\n" + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(*out_eax), "=D"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) + : "a"(leaf) + ); +#else + __asm__ volatile ( + "xor %%ecx, %%ecx\n" + "cpuid\n" + : "=a"(*out_eax), "=b"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) + : "a"(leaf) + ); +#endif +} + +// OPENSSL_xgetbv returns the value of an Intel Extended Control Register (XCR). +// Currently only XCR0 is defined by Intel so |xcr| should always be zero. +// +// See https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family +static uint64_t OPENSSL_xgetbv(uint32_t xcr) { +#if defined(_MSC_VER) && !defined(__clang__) + return (uint64_t)_xgetbv(xcr); +#else + uint32_t eax, edx; + __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (((uint64_t)edx) << 32) | eax; +#endif +} + +void OPENSSL_cpuid_setup(uint32_t OPENSSL_ia32cap_P[4]) { + // Determine the vendor and maximum input value. + uint32_t eax, ebx, ecx, edx; + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0); + + uint32_t num_ids = eax; + + int is_intel = ebx == 0x756e6547 /* Genu */ && + edx == 0x49656e69 /* ineI */ && + ecx == 0x6c65746e /* ntel */; + + uint32_t extended_features[2] = {0}; + if (num_ids >= 7) { + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7); + extended_features[0] = ebx; + extended_features[1] = ecx; + } + + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); + + const uint32_t base_family = (eax >> 8) & 15; + const uint32_t base_model = (eax >> 4) & 15; + + uint32_t family = base_family; + uint32_t model = base_model; + if (base_family == 15) { + const uint32_t ext_family = (eax >> 20) & 255; + family += ext_family; + } + if (base_family == 6 || base_family == 15) { + const uint32_t ext_model = (eax >> 16) & 15; + model |= ext_model << 4; + } + + // Reserved bit #30 is repurposed to signal an Intel CPU. + if (is_intel) { + edx |= (1u << 30); + } else { + edx &= ~(1u << 30); + } + + uint64_t xcr0 = 0; + if (ecx & (1u << 27)) { + // XCR0 may only be queried if the OSXSAVE bit is set. + xcr0 = OPENSSL_xgetbv(0); + } + // See Intel manual, volume 1, section 14.3. + if ((xcr0 & 6) != 6) { + // YMM registers cannot be used. + ecx &= ~(1u << 28); // AVX + ecx &= ~(1u << 12); // FMA + ecx &= ~(1u << 11); // AMD XOP + extended_features[0] &= ~(1u << 5); // AVX2 + extended_features[1] &= ~(1u << 9); // VAES + extended_features[1] &= ~(1u << 10); // VPCLMULQDQ + } + // See Intel manual, volume 1, sections 15.2 ("Detection of AVX-512 Foundation + // Instructions") through 15.4 ("Detection of Intel AVX-512 Instruction Groups + // Operating at 256 and 128-bit Vector Lengths"). + if ((xcr0 & 0xe6) != 0xe6) { + // Without XCR0.111xx11x, no AVX512 feature can be used. 
This includes ZMM + // registers, masking, SIMD registers 16-31 (even if accessed as YMM or + // XMM), and EVEX-coded instructions (even on YMM or XMM). Even if only + // XCR0.ZMM_Hi256 is missing, it isn't valid to use AVX512 features on + // shorter vectors, since AVX512 ties everything to the availability of + // 512-bit vectors. See the above-mentioned sections of the Intel manual, + // which say that *all* these XCR0 bits must be checked even when just using + // 128-bit or 256-bit vectors, and also volume 2a section 2.7.11 ("#UD + // Equations for EVEX") which says that all EVEX-coded instructions raise an + // undefined-instruction exception if any of these XCR0 bits is zero. + // + // AVX10 fixes this by reorganizing the features that used to be part of + // "AVX512" and allowing them to be used independently of 512-bit support. + // TODO: add AVX10 detection. + extended_features[0] &= ~(1u << 16); // AVX512F + extended_features[0] &= ~(1u << 17); // AVX512DQ + extended_features[0] &= ~(1u << 21); // AVX512IFMA + extended_features[0] &= ~(1u << 26); // AVX512PF + extended_features[0] &= ~(1u << 27); // AVX512ER + extended_features[0] &= ~(1u << 28); // AVX512CD + extended_features[0] &= ~(1u << 30); // AVX512BW + extended_features[0] &= ~(1u << 31); // AVX512VL + extended_features[1] &= ~(1u << 1); // AVX512VBMI + extended_features[1] &= ~(1u << 6); // AVX512VBMI2 + extended_features[1] &= ~(1u << 11); // AVX512VNNI + extended_features[1] &= ~(1u << 12); // AVX512BITALG + extended_features[1] &= ~(1u << 14); // AVX512VPOPCNTDQ + } + + // Repurpose the bit for the removed MPX feature to indicate when using zmm + // registers should be avoided even when they are supported. (When set, AVX512 + // features can still be used, but only using ymm or xmm registers.) Skylake + // suffered from severe downclocking when zmm registers were used, which + // affected unrelated code running on the system, making zmm registers not too + // useful outside of benchmarks. The situation improved significantly by Ice + // Lake, but a small amount of downclocking remained. (See + // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/) + // We take a conservative approach of not allowing zmm registers until after + // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side. + // + // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported + // to have any downclocking problem when zmm registers are used. + if (is_intel && family == 6 && + (model == 85 || // Skylake, Cascade Lake, Cooper Lake (server) + model == 106 || // Ice Lake (server) + model == 108 || // Ice Lake (micro server) + model == 125 || // Ice Lake (client) + model == 126 || // Ice Lake (mobile) + model == 140 || // Tiger Lake (mobile) + model == 141)) { // Tiger Lake (client) + extended_features[0] |= 1u << 14; + } else { + extended_features[0] &= ~(1u << 14); + } + + OPENSSL_ia32cap_P[0] = edx; + OPENSSL_ia32cap_P[1] = ecx; + OPENSSL_ia32cap_P[2] = extended_features[0]; + OPENSSL_ia32cap_P[3] = extended_features[1]; +} + +#endif // !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) diff --git a/ring-0.17.14/crypto/crypto.c b/ring-0.17.14/crypto/crypto.c new file mode 100644 index 0000000000..153f97f821 --- /dev/null +++ b/ring-0.17.14/crypto/crypto.c @@ -0,0 +1,34 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
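The detection code above only reports AVX, AVX2, and the AVX-512 groups as usable after confirming that the operating system actually saves the corresponding register state: CPUID.1:ECX bit 27 (OSXSAVE) gates an XGETBV read of XCR0, and the SSE+AVX state bits (mask 6) or the wider AVX-512 state bits (mask 0xe6) must all be set before the feature bits are kept. A minimal standalone check in the same spirit for the AVX case (GCC/Clang on x86-64; an illustration under those assumptions, not ring's detection code):

#include <stdint.h>
#include <stdio.h>
#include <cpuid.h>  /* GCC/Clang helper around the CPUID instruction */

int main(void) {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
  int osxsave = (ecx >> 27) & 1;  /* OS exposes XCR0 via XGETBV */
  int avx_cpu = (ecx >> 28) & 1;  /* CPU implements AVX */
  uint64_t xcr0 = 0;
  if (osxsave) {
    uint32_t lo, hi;
    __asm__ volatile("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
    xcr0 = ((uint64_t)hi << 32) | lo;
  }
  /* YMM registers are usable only if the OS saves both SSE and AVX state. */
  int avx_usable = avx_cpu && osxsave && (xcr0 & 6) == 6;
  printf("AVX usable: %d\n", avx_usable);
  return 0;
}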
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +// Our assembly does not use the GOT to reference symbols, which means +// references to visible symbols will often require a TEXTREL. This is +// undesirable, so all assembly-referenced symbols should be hidden. CPU +// capabilities are the only such symbols defined in C. Explicitly hide them, +// rather than rely on being built with -fvisibility=hidden. +#if defined(OPENSSL_WINDOWS) +#define HIDDEN +#else +#define HIDDEN __attribute__((visibility("hidden"))) +#endif + +#if defined(OPENSSL_X86_64) +// These are declared as `AtomicU32` on the Rust side. +HIDDEN uint32_t avx2_available = 0; +HIDDEN uint32_t adx_bmi2_available = 0; +#elif defined(OPENSSL_ARM) +HIDDEN uint32_t neon_available = 0; +#endif diff --git a/ring-0.17.14/crypto/curve25519/asm/x25519-asm-arm.S b/ring-0.17.14/crypto/curve25519/asm/x25519-asm-arm.S new file mode 100644 index 0000000000..c085734102 --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/asm/x25519-asm-arm.S @@ -0,0 +1,2124 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* This file is taken from crypto_scalarmult/curve25519/neon2/scalarmult.s in + * SUPERCOP 20141124 (http://bench.cr.yp.to/supercop.html). That code is public + * domain licensed but the standard Apache 2.0 license is included above to keep + * licensing simple. */ + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) + +.fpu neon +.text +.align 4 + +.global x25519_NEON +.hidden x25519_NEON +.type x25519_NEON, %function +x25519_NEON: +vpush {q4,q5,q6,q7} +mov r12,sp +sub sp,sp,#736 +and sp,sp,#0xffffffe0 +strd r4,[sp,#0] +strd r6,[sp,#8] +strd r8,[sp,#16] +strd r10,[sp,#24] +str r12,[sp,#480] +str r14,[sp,#484] +mov r0,r0 +mov r1,r1 +mov r2,r2 +add r3,sp,#32 +ldr r4,=0 +ldr r5,=254 +vmov.i32 q0,#1 +vshr.u64 q1,q0,#7 +vshr.u64 q0,q0,#8 +vmov.i32 d4,#19 +vmov.i32 d5,#38 +add r6,sp,#512 +vst1.8 {d2-d3},[r6,: 128] +add r6,sp,#528 +vst1.8 {d0-d1},[r6,: 128] +add r6,sp,#544 +vst1.8 {d4-d5},[r6,: 128] +add r6,r3,#0 +vmov.i32 q2,#0 +vst1.8 {d4-d5},[r6,: 128]! +vst1.8 {d4-d5},[r6,: 128]! +vst1.8 d4,[r6,: 64] +add r6,r3,#0 +ldr r7,=960 +sub r7,r7,#2 +neg r7,r7 +sub r7,r7,r7,LSL #7 +str r7,[r6] +add r6,sp,#704 +vld1.8 {d4-d5},[r1]! +vld1.8 {d6-d7},[r1] +vst1.8 {d4-d5},[r6,: 128]! 
+vst1.8 {d6-d7},[r6,: 128] +sub r1,r6,#16 +ldrb r6,[r1] +and r6,r6,#248 +strb r6,[r1] +ldrb r6,[r1,#31] +and r6,r6,#127 +orr r6,r6,#64 +strb r6,[r1,#31] +vmov.i64 q2,#0xffffffff +vshr.u64 q3,q2,#7 +vshr.u64 q2,q2,#6 +vld1.8 {d8},[r2] +vld1.8 {d10},[r2] +add r2,r2,#6 +vld1.8 {d12},[r2] +vld1.8 {d14},[r2] +add r2,r2,#6 +vld1.8 {d16},[r2] +add r2,r2,#4 +vld1.8 {d18},[r2] +vld1.8 {d20},[r2] +add r2,r2,#6 +vld1.8 {d22},[r2] +add r2,r2,#2 +vld1.8 {d24},[r2] +vld1.8 {d26},[r2] +vshr.u64 q5,q5,#26 +vshr.u64 q6,q6,#3 +vshr.u64 q7,q7,#29 +vshr.u64 q8,q8,#6 +vshr.u64 q10,q10,#25 +vshr.u64 q11,q11,#3 +vshr.u64 q12,q12,#12 +vshr.u64 q13,q13,#38 +vand q4,q4,q2 +vand q6,q6,q2 +vand q8,q8,q2 +vand q10,q10,q2 +vand q2,q12,q2 +vand q5,q5,q3 +vand q7,q7,q3 +vand q9,q9,q3 +vand q11,q11,q3 +vand q3,q13,q3 +add r2,r3,#48 +vadd.i64 q12,q4,q1 +vadd.i64 q13,q10,q1 +vshr.s64 q12,q12,#26 +vshr.s64 q13,q13,#26 +vadd.i64 q5,q5,q12 +vshl.i64 q12,q12,#26 +vadd.i64 q14,q5,q0 +vadd.i64 q11,q11,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q15,q11,q0 +vsub.i64 q4,q4,q12 +vshr.s64 q12,q14,#25 +vsub.i64 q10,q10,q13 +vshr.s64 q13,q15,#25 +vadd.i64 q6,q6,q12 +vshl.i64 q12,q12,#25 +vadd.i64 q14,q6,q1 +vadd.i64 q2,q2,q13 +vsub.i64 q5,q5,q12 +vshr.s64 q12,q14,#26 +vshl.i64 q13,q13,#25 +vadd.i64 q14,q2,q1 +vadd.i64 q7,q7,q12 +vshl.i64 q12,q12,#26 +vadd.i64 q15,q7,q0 +vsub.i64 q11,q11,q13 +vshr.s64 q13,q14,#26 +vsub.i64 q6,q6,q12 +vshr.s64 q12,q15,#25 +vadd.i64 q3,q3,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q14,q3,q0 +vadd.i64 q8,q8,q12 +vshl.i64 q12,q12,#25 +vadd.i64 q15,q8,q1 +add r2,r2,#8 +vsub.i64 q2,q2,q13 +vshr.s64 q13,q14,#25 +vsub.i64 q7,q7,q12 +vshr.s64 q12,q15,#26 +vadd.i64 q14,q13,q13 +vadd.i64 q9,q9,q12 +vtrn.32 d12,d14 +vshl.i64 q12,q12,#26 +vtrn.32 d13,d15 +vadd.i64 q0,q9,q0 +vadd.i64 q4,q4,q14 +vst1.8 d12,[r2,: 64]! +vshl.i64 q6,q13,#4 +vsub.i64 q7,q8,q12 +vshr.s64 q0,q0,#25 +vadd.i64 q4,q4,q6 +vadd.i64 q6,q10,q0 +vshl.i64 q0,q0,#25 +vadd.i64 q8,q6,q1 +vadd.i64 q4,q4,q13 +vshl.i64 q10,q13,#25 +vadd.i64 q1,q4,q1 +vsub.i64 q0,q9,q0 +vshr.s64 q8,q8,#26 +vsub.i64 q3,q3,q10 +vtrn.32 d14,d0 +vshr.s64 q1,q1,#26 +vtrn.32 d15,d1 +vadd.i64 q0,q11,q8 +vst1.8 d14,[r2,: 64] +vshl.i64 q7,q8,#26 +vadd.i64 q5,q5,q1 +vtrn.32 d4,d6 +vshl.i64 q1,q1,#26 +vtrn.32 d5,d7 +vsub.i64 q3,q6,q7 +add r2,r2,#16 +vsub.i64 q1,q4,q1 +vst1.8 d4,[r2,: 64] +vtrn.32 d6,d0 +vtrn.32 d7,d1 +sub r2,r2,#8 +vtrn.32 d2,d10 +vtrn.32 d3,d11 +vst1.8 d6,[r2,: 64] +sub r2,r2,#24 +vst1.8 d2,[r2,: 64] +add r2,r3,#96 +vmov.i32 q0,#0 +vmov.i64 d2,#0xff +vmov.i64 d3,#0 +vshr.u32 q1,q1,#7 +vst1.8 {d2-d3},[r2,: 128]! +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 d0,[r2,: 64] +add r2,r3,#144 +vmov.i32 q0,#0 +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 d0,[r2,: 64] +add r2,r3,#240 +vmov.i32 q0,#0 +vmov.i64 d2,#0xff +vmov.i64 d3,#0 +vshr.u32 q1,q1,#7 +vst1.8 {d2-d3},[r2,: 128]! +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 d0,[r2,: 64] +add r2,r3,#48 +add r6,r3,#192 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d4},[r2,: 64] +vst1.8 {d0-d1},[r6,: 128]! +vst1.8 {d2-d3},[r6,: 128]! +vst1.8 d4,[r6,: 64] +._mainloop: +mov r2,r5,LSR #3 +and r6,r5,#7 +ldrb r2,[r1,r2] +mov r2,r2,LSR r6 +and r2,r2,#1 +str r5,[sp,#488] +eor r4,r4,r2 +str r2,[sp,#492] +neg r2,r4 +add r4,r3,#96 +add r5,r3,#192 +add r6,r3,#144 +vld1.8 {d8-d9},[r4,: 128]! +add r7,r3,#240 +vld1.8 {d10-d11},[r5,: 128]! +veor q6,q4,q5 +vld1.8 {d14-d15},[r6,: 128]! +vdup.i32 q8,r2 +vld1.8 {d18-d19},[r7,: 128]! +veor q10,q7,q9 +vld1.8 {d22-d23},[r4,: 128]! +vand q6,q6,q8 +vld1.8 {d24-d25},[r5,: 128]! 
+vand q10,q10,q8 +vld1.8 {d26-d27},[r6,: 128]! +veor q4,q4,q6 +vld1.8 {d28-d29},[r7,: 128]! +veor q5,q5,q6 +vld1.8 {d0},[r4,: 64] +veor q6,q7,q10 +vld1.8 {d2},[r5,: 64] +veor q7,q9,q10 +vld1.8 {d4},[r6,: 64] +veor q9,q11,q12 +vld1.8 {d6},[r7,: 64] +veor q10,q0,q1 +sub r2,r4,#32 +vand q9,q9,q8 +sub r4,r5,#32 +vand q10,q10,q8 +sub r5,r6,#32 +veor q11,q11,q9 +sub r6,r7,#32 +veor q0,q0,q10 +veor q9,q12,q9 +veor q1,q1,q10 +veor q10,q13,q14 +veor q12,q2,q3 +vand q10,q10,q8 +vand q8,q12,q8 +veor q12,q13,q10 +veor q2,q2,q8 +veor q10,q14,q10 +veor q3,q3,q8 +vadd.i32 q8,q4,q6 +vsub.i32 q4,q4,q6 +vst1.8 {d16-d17},[r2,: 128]! +vadd.i32 q6,q11,q12 +vst1.8 {d8-d9},[r5,: 128]! +vsub.i32 q4,q11,q12 +vst1.8 {d12-d13},[r2,: 128]! +vadd.i32 q6,q0,q2 +vst1.8 {d8-d9},[r5,: 128]! +vsub.i32 q0,q0,q2 +vst1.8 d12,[r2,: 64] +vadd.i32 q2,q5,q7 +vst1.8 d0,[r5,: 64] +vsub.i32 q0,q5,q7 +vst1.8 {d4-d5},[r4,: 128]! +vadd.i32 q2,q9,q10 +vst1.8 {d0-d1},[r6,: 128]! +vsub.i32 q0,q9,q10 +vst1.8 {d4-d5},[r4,: 128]! +vadd.i32 q2,q1,q3 +vst1.8 {d0-d1},[r6,: 128]! +vsub.i32 q0,q1,q3 +vst1.8 d4,[r4,: 64] +vst1.8 d0,[r6,: 64] +add r2,sp,#544 +add r4,r3,#96 +add r5,r3,#144 +vld1.8 {d0-d1},[r2,: 128] +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4-d5},[r5,: 128]! +vzip.i32 q1,q2 +vld1.8 {d6-d7},[r4,: 128]! +vld1.8 {d8-d9},[r5,: 128]! +vshl.i32 q5,q1,#1 +vzip.i32 q3,q4 +vshl.i32 q6,q2,#1 +vld1.8 {d14},[r4,: 64] +vshl.i32 q8,q3,#1 +vld1.8 {d15},[r5,: 64] +vshl.i32 q9,q4,#1 +vmul.i32 d21,d7,d1 +vtrn.32 d14,d15 +vmul.i32 q11,q4,q0 +vmul.i32 q0,q7,q0 +vmull.s32 q12,d2,d2 +vmlal.s32 q12,d11,d1 +vmlal.s32 q12,d12,d0 +vmlal.s32 q12,d13,d23 +vmlal.s32 q12,d16,d22 +vmlal.s32 q12,d7,d21 +vmull.s32 q10,d2,d11 +vmlal.s32 q10,d4,d1 +vmlal.s32 q10,d13,d0 +vmlal.s32 q10,d6,d23 +vmlal.s32 q10,d17,d22 +vmull.s32 q13,d10,d4 +vmlal.s32 q13,d11,d3 +vmlal.s32 q13,d13,d1 +vmlal.s32 q13,d16,d0 +vmlal.s32 q13,d17,d23 +vmlal.s32 q13,d8,d22 +vmull.s32 q1,d10,d5 +vmlal.s32 q1,d11,d4 +vmlal.s32 q1,d6,d1 +vmlal.s32 q1,d17,d0 +vmlal.s32 q1,d8,d23 +vmull.s32 q14,d10,d6 +vmlal.s32 q14,d11,d13 +vmlal.s32 q14,d4,d4 +vmlal.s32 q14,d17,d1 +vmlal.s32 q14,d18,d0 +vmlal.s32 q14,d9,d23 +vmull.s32 q11,d10,d7 +vmlal.s32 q11,d11,d6 +vmlal.s32 q11,d12,d5 +vmlal.s32 q11,d8,d1 +vmlal.s32 q11,d19,d0 +vmull.s32 q15,d10,d8 +vmlal.s32 q15,d11,d17 +vmlal.s32 q15,d12,d6 +vmlal.s32 q15,d13,d5 +vmlal.s32 q15,d19,d1 +vmlal.s32 q15,d14,d0 +vmull.s32 q2,d10,d9 +vmlal.s32 q2,d11,d8 +vmlal.s32 q2,d12,d7 +vmlal.s32 q2,d13,d6 +vmlal.s32 q2,d14,d1 +vmull.s32 q0,d15,d1 +vmlal.s32 q0,d10,d14 +vmlal.s32 q0,d11,d19 +vmlal.s32 q0,d12,d8 +vmlal.s32 q0,d13,d17 +vmlal.s32 q0,d6,d6 +add r2,sp,#512 +vld1.8 {d18-d19},[r2,: 128] +vmull.s32 q3,d16,d7 +vmlal.s32 q3,d10,d15 +vmlal.s32 q3,d11,d14 +vmlal.s32 q3,d12,d9 +vmlal.s32 q3,d13,d8 +add r2,sp,#528 +vld1.8 {d8-d9},[r2,: 128] +vadd.i64 q5,q12,q9 +vadd.i64 q6,q15,q9 +vshr.s64 q5,q5,#26 +vshr.s64 q6,q6,#26 +vadd.i64 q7,q10,q5 +vshl.i64 q5,q5,#26 +vadd.i64 q8,q7,q4 +vadd.i64 q2,q2,q6 +vshl.i64 q6,q6,#26 +vadd.i64 q10,q2,q4 +vsub.i64 q5,q12,q5 +vshr.s64 q8,q8,#25 +vsub.i64 q6,q15,q6 +vshr.s64 q10,q10,#25 +vadd.i64 q12,q13,q8 +vshl.i64 q8,q8,#25 +vadd.i64 q13,q12,q9 +vadd.i64 q0,q0,q10 +vsub.i64 q7,q7,q8 +vshr.s64 q8,q13,#26 +vshl.i64 q10,q10,#25 +vadd.i64 q13,q0,q9 +vadd.i64 q1,q1,q8 +vshl.i64 q8,q8,#26 +vadd.i64 q15,q1,q4 +vsub.i64 q2,q2,q10 +vshr.s64 q10,q13,#26 +vsub.i64 q8,q12,q8 +vshr.s64 q12,q15,#25 +vadd.i64 q3,q3,q10 +vshl.i64 q10,q10,#26 +vadd.i64 q13,q3,q4 +vadd.i64 q14,q14,q12 +add r2,r3,#288 +vshl.i64 q12,q12,#25 +add r4,r3,#336 +vadd.i64 q15,q14,q9 +add 
r2,r2,#8 +vsub.i64 q0,q0,q10 +add r4,r4,#8 +vshr.s64 q10,q13,#25 +vsub.i64 q1,q1,q12 +vshr.s64 q12,q15,#26 +vadd.i64 q13,q10,q10 +vadd.i64 q11,q11,q12 +vtrn.32 d16,d2 +vshl.i64 q12,q12,#26 +vtrn.32 d17,d3 +vadd.i64 q1,q11,q4 +vadd.i64 q4,q5,q13 +vst1.8 d16,[r2,: 64]! +vshl.i64 q5,q10,#4 +vst1.8 d17,[r4,: 64]! +vsub.i64 q8,q14,q12 +vshr.s64 q1,q1,#25 +vadd.i64 q4,q4,q5 +vadd.i64 q5,q6,q1 +vshl.i64 q1,q1,#25 +vadd.i64 q6,q5,q9 +vadd.i64 q4,q4,q10 +vshl.i64 q10,q10,#25 +vadd.i64 q9,q4,q9 +vsub.i64 q1,q11,q1 +vshr.s64 q6,q6,#26 +vsub.i64 q3,q3,q10 +vtrn.32 d16,d2 +vshr.s64 q9,q9,#26 +vtrn.32 d17,d3 +vadd.i64 q1,q2,q6 +vst1.8 d16,[r2,: 64] +vshl.i64 q2,q6,#26 +vst1.8 d17,[r4,: 64] +vadd.i64 q6,q7,q9 +vtrn.32 d0,d6 +vshl.i64 q7,q9,#26 +vtrn.32 d1,d7 +vsub.i64 q2,q5,q2 +add r2,r2,#16 +vsub.i64 q3,q4,q7 +vst1.8 d0,[r2,: 64] +add r4,r4,#16 +vst1.8 d1,[r4,: 64] +vtrn.32 d4,d2 +vtrn.32 d5,d3 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d6,d12 +vtrn.32 d7,d13 +vst1.8 d4,[r2,: 64] +vst1.8 d5,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d6,[r2,: 64] +vst1.8 d7,[r4,: 64] +add r2,r3,#240 +add r4,r3,#96 +vld1.8 {d0-d1},[r4,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4},[r4,: 64] +add r4,r3,#144 +vld1.8 {d6-d7},[r4,: 128]! +vtrn.32 q0,q3 +vld1.8 {d8-d9},[r4,: 128]! +vshl.i32 q5,q0,#4 +vtrn.32 q1,q4 +vshl.i32 q6,q3,#4 +vadd.i32 q5,q5,q0 +vadd.i32 q6,q6,q3 +vshl.i32 q7,q1,#4 +vld1.8 {d5},[r4,: 64] +vshl.i32 q8,q4,#4 +vtrn.32 d4,d5 +vadd.i32 q7,q7,q1 +vadd.i32 q8,q8,q4 +vld1.8 {d18-d19},[r2,: 128]! +vshl.i32 q10,q2,#4 +vld1.8 {d22-d23},[r2,: 128]! +vadd.i32 q10,q10,q2 +vld1.8 {d24},[r2,: 64] +vadd.i32 q5,q5,q0 +add r2,r3,#192 +vld1.8 {d26-d27},[r2,: 128]! +vadd.i32 q6,q6,q3 +vld1.8 {d28-d29},[r2,: 128]! +vadd.i32 q8,q8,q4 +vld1.8 {d25},[r2,: 64] +vadd.i32 q10,q10,q2 +vtrn.32 q9,q13 +vadd.i32 q7,q7,q1 +vadd.i32 q5,q5,q0 +vtrn.32 q11,q14 +vadd.i32 q6,q6,q3 +add r2,sp,#560 +vadd.i32 q10,q10,q2 +vtrn.32 d24,d25 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q6,q13,#1 +add r2,sp,#576 +vst1.8 {d20-d21},[r2,: 128] +vshl.i32 q10,q14,#1 +add r2,sp,#592 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q15,q12,#1 +vadd.i32 q8,q8,q4 +vext.32 d10,d31,d30,#0 +vadd.i32 q7,q7,q1 +add r2,sp,#608 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q8,d18,d5 +vmlal.s32 q8,d26,d4 +vmlal.s32 q8,d19,d9 +vmlal.s32 q8,d27,d3 +vmlal.s32 q8,d22,d8 +vmlal.s32 q8,d28,d2 +vmlal.s32 q8,d23,d7 +vmlal.s32 q8,d29,d1 +vmlal.s32 q8,d24,d6 +vmlal.s32 q8,d25,d0 +add r2,sp,#624 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q2,d18,d4 +vmlal.s32 q2,d12,d9 +vmlal.s32 q2,d13,d8 +vmlal.s32 q2,d19,d3 +vmlal.s32 q2,d22,d2 +vmlal.s32 q2,d23,d1 +vmlal.s32 q2,d24,d0 +add r2,sp,#640 +vst1.8 {d20-d21},[r2,: 128] +vmull.s32 q7,d18,d9 +vmlal.s32 q7,d26,d3 +vmlal.s32 q7,d19,d8 +vmlal.s32 q7,d27,d2 +vmlal.s32 q7,d22,d7 +vmlal.s32 q7,d28,d1 +vmlal.s32 q7,d23,d6 +vmlal.s32 q7,d29,d0 +add r2,sp,#656 +vst1.8 {d10-d11},[r2,: 128] +vmull.s32 q5,d18,d3 +vmlal.s32 q5,d19,d2 +vmlal.s32 q5,d22,d1 +vmlal.s32 q5,d23,d0 +vmlal.s32 q5,d12,d8 +add r2,sp,#672 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q4,d18,d8 +vmlal.s32 q4,d26,d2 +vmlal.s32 q4,d19,d7 +vmlal.s32 q4,d27,d1 +vmlal.s32 q4,d22,d6 +vmlal.s32 q4,d28,d0 +vmull.s32 q8,d18,d7 +vmlal.s32 q8,d26,d1 +vmlal.s32 q8,d19,d6 +vmlal.s32 q8,d27,d0 +add r2,sp,#576 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q7,d24,d21 +vmlal.s32 q7,d25,d20 +vmlal.s32 q4,d23,d21 +vmlal.s32 q4,d29,d20 +vmlal.s32 q8,d22,d21 +vmlal.s32 q8,d28,d20 +vmlal.s32 q5,d24,d20 +add r2,sp,#576 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q7,d18,d6 +vmlal.s32 q7,d26,d0 +add r2,sp,#656 +vld1.8 {d30-d31},[r2,: 
128] +vmlal.s32 q2,d30,d21 +vmlal.s32 q7,d19,d21 +vmlal.s32 q7,d27,d20 +add r2,sp,#624 +vld1.8 {d26-d27},[r2,: 128] +vmlal.s32 q4,d25,d27 +vmlal.s32 q8,d29,d27 +vmlal.s32 q8,d25,d26 +vmlal.s32 q7,d28,d27 +vmlal.s32 q7,d29,d26 +add r2,sp,#608 +vld1.8 {d28-d29},[r2,: 128] +vmlal.s32 q4,d24,d29 +vmlal.s32 q8,d23,d29 +vmlal.s32 q8,d24,d28 +vmlal.s32 q7,d22,d29 +vmlal.s32 q7,d23,d28 +add r2,sp,#608 +vst1.8 {d8-d9},[r2,: 128] +add r2,sp,#560 +vld1.8 {d8-d9},[r2,: 128] +vmlal.s32 q7,d24,d9 +vmlal.s32 q7,d25,d31 +vmull.s32 q1,d18,d2 +vmlal.s32 q1,d19,d1 +vmlal.s32 q1,d22,d0 +vmlal.s32 q1,d24,d27 +vmlal.s32 q1,d23,d20 +vmlal.s32 q1,d12,d7 +vmlal.s32 q1,d13,d6 +vmull.s32 q6,d18,d1 +vmlal.s32 q6,d19,d0 +vmlal.s32 q6,d23,d27 +vmlal.s32 q6,d22,d20 +vmlal.s32 q6,d24,d26 +vmull.s32 q0,d18,d0 +vmlal.s32 q0,d22,d27 +vmlal.s32 q0,d23,d26 +vmlal.s32 q0,d24,d31 +vmlal.s32 q0,d19,d20 +add r2,sp,#640 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q2,d18,d7 +vmlal.s32 q2,d19,d6 +vmlal.s32 q5,d18,d6 +vmlal.s32 q5,d19,d21 +vmlal.s32 q1,d18,d21 +vmlal.s32 q1,d19,d29 +vmlal.s32 q0,d18,d28 +vmlal.s32 q0,d19,d9 +vmlal.s32 q6,d18,d29 +vmlal.s32 q6,d19,d28 +add r2,sp,#592 +vld1.8 {d18-d19},[r2,: 128] +add r2,sp,#512 +vld1.8 {d22-d23},[r2,: 128] +vmlal.s32 q5,d19,d7 +vmlal.s32 q0,d18,d21 +vmlal.s32 q0,d19,d29 +vmlal.s32 q6,d18,d6 +add r2,sp,#528 +vld1.8 {d6-d7},[r2,: 128] +vmlal.s32 q6,d19,d21 +add r2,sp,#576 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q0,d30,d8 +add r2,sp,#672 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q5,d30,d29 +add r2,sp,#608 +vld1.8 {d24-d25},[r2,: 128] +vmlal.s32 q1,d30,d28 +vadd.i64 q13,q0,q11 +vadd.i64 q14,q5,q11 +vmlal.s32 q6,d30,d9 +vshr.s64 q4,q13,#26 +vshr.s64 q13,q14,#26 +vadd.i64 q7,q7,q4 +vshl.i64 q4,q4,#26 +vadd.i64 q14,q7,q3 +vadd.i64 q9,q9,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q15,q9,q3 +vsub.i64 q0,q0,q4 +vshr.s64 q4,q14,#25 +vsub.i64 q5,q5,q13 +vshr.s64 q13,q15,#25 +vadd.i64 q6,q6,q4 +vshl.i64 q4,q4,#25 +vadd.i64 q14,q6,q11 +vadd.i64 q2,q2,q13 +vsub.i64 q4,q7,q4 +vshr.s64 q7,q14,#26 +vshl.i64 q13,q13,#25 +vadd.i64 q14,q2,q11 +vadd.i64 q8,q8,q7 +vshl.i64 q7,q7,#26 +vadd.i64 q15,q8,q3 +vsub.i64 q9,q9,q13 +vshr.s64 q13,q14,#26 +vsub.i64 q6,q6,q7 +vshr.s64 q7,q15,#25 +vadd.i64 q10,q10,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q14,q10,q3 +vadd.i64 q1,q1,q7 +add r2,r3,#144 +vshl.i64 q7,q7,#25 +add r4,r3,#96 +vadd.i64 q15,q1,q11 +add r2,r2,#8 +vsub.i64 q2,q2,q13 +add r4,r4,#8 +vshr.s64 q13,q14,#25 +vsub.i64 q7,q8,q7 +vshr.s64 q8,q15,#26 +vadd.i64 q14,q13,q13 +vadd.i64 q12,q12,q8 +vtrn.32 d12,d14 +vshl.i64 q8,q8,#26 +vtrn.32 d13,d15 +vadd.i64 q3,q12,q3 +vadd.i64 q0,q0,q14 +vst1.8 d12,[r2,: 64]! +vshl.i64 q7,q13,#4 +vst1.8 d13,[r4,: 64]! +vsub.i64 q1,q1,q8 +vshr.s64 q3,q3,#25 +vadd.i64 q0,q0,q7 +vadd.i64 q5,q5,q3 +vshl.i64 q3,q3,#25 +vadd.i64 q6,q5,q11 +vadd.i64 q0,q0,q13 +vshl.i64 q7,q13,#25 +vadd.i64 q8,q0,q11 +vsub.i64 q3,q12,q3 +vshr.s64 q6,q6,#26 +vsub.i64 q7,q10,q7 +vtrn.32 d2,d6 +vshr.s64 q8,q8,#26 +vtrn.32 d3,d7 +vadd.i64 q3,q9,q6 +vst1.8 d2,[r2,: 64] +vshl.i64 q6,q6,#26 +vst1.8 d3,[r4,: 64] +vadd.i64 q1,q4,q8 +vtrn.32 d4,d14 +vshl.i64 q4,q8,#26 +vtrn.32 d5,d15 +vsub.i64 q5,q5,q6 +add r2,r2,#16 +vsub.i64 q0,q0,q4 +vst1.8 d4,[r2,: 64] +add r4,r4,#16 +vst1.8 d5,[r4,: 64] +vtrn.32 d10,d6 +vtrn.32 d11,d7 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d0,d2 +vtrn.32 d1,d3 +vst1.8 d10,[r2,: 64] +vst1.8 d11,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d0,[r2,: 64] +vst1.8 d1,[r4,: 64] +add r2,r3,#288 +add r4,r3,#336 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vsub.i32 q0,q0,q1 +vld1.8 {d2-d3},[r2,: 128]! 
+vld1.8 {d4-d5},[r4,: 128]! +vsub.i32 q1,q1,q2 +add r5,r3,#240 +vld1.8 {d4},[r2,: 64] +vld1.8 {d6},[r4,: 64] +vsub.i32 q2,q2,q3 +vst1.8 {d0-d1},[r5,: 128]! +vst1.8 {d2-d3},[r5,: 128]! +vst1.8 d4,[r5,: 64] +add r2,r3,#144 +add r4,r3,#96 +add r5,r3,#144 +add r6,r3,#192 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vsub.i32 q2,q0,q1 +vadd.i32 q0,q0,q1 +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d6-d7},[r4,: 128]! +vsub.i32 q4,q1,q3 +vadd.i32 q1,q1,q3 +vld1.8 {d6},[r2,: 64] +vld1.8 {d10},[r4,: 64] +vsub.i32 q6,q3,q5 +vadd.i32 q3,q3,q5 +vst1.8 {d4-d5},[r5,: 128]! +vst1.8 {d0-d1},[r6,: 128]! +vst1.8 {d8-d9},[r5,: 128]! +vst1.8 {d2-d3},[r6,: 128]! +vst1.8 d12,[r5,: 64] +vst1.8 d6,[r6,: 64] +add r2,r3,#0 +add r4,r3,#240 +vld1.8 {d0-d1},[r4,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4},[r4,: 64] +add r4,r3,#336 +vld1.8 {d6-d7},[r4,: 128]! +vtrn.32 q0,q3 +vld1.8 {d8-d9},[r4,: 128]! +vshl.i32 q5,q0,#4 +vtrn.32 q1,q4 +vshl.i32 q6,q3,#4 +vadd.i32 q5,q5,q0 +vadd.i32 q6,q6,q3 +vshl.i32 q7,q1,#4 +vld1.8 {d5},[r4,: 64] +vshl.i32 q8,q4,#4 +vtrn.32 d4,d5 +vadd.i32 q7,q7,q1 +vadd.i32 q8,q8,q4 +vld1.8 {d18-d19},[r2,: 128]! +vshl.i32 q10,q2,#4 +vld1.8 {d22-d23},[r2,: 128]! +vadd.i32 q10,q10,q2 +vld1.8 {d24},[r2,: 64] +vadd.i32 q5,q5,q0 +add r2,r3,#288 +vld1.8 {d26-d27},[r2,: 128]! +vadd.i32 q6,q6,q3 +vld1.8 {d28-d29},[r2,: 128]! +vadd.i32 q8,q8,q4 +vld1.8 {d25},[r2,: 64] +vadd.i32 q10,q10,q2 +vtrn.32 q9,q13 +vadd.i32 q7,q7,q1 +vadd.i32 q5,q5,q0 +vtrn.32 q11,q14 +vadd.i32 q6,q6,q3 +add r2,sp,#560 +vadd.i32 q10,q10,q2 +vtrn.32 d24,d25 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q6,q13,#1 +add r2,sp,#576 +vst1.8 {d20-d21},[r2,: 128] +vshl.i32 q10,q14,#1 +add r2,sp,#592 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q15,q12,#1 +vadd.i32 q8,q8,q4 +vext.32 d10,d31,d30,#0 +vadd.i32 q7,q7,q1 +add r2,sp,#608 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q8,d18,d5 +vmlal.s32 q8,d26,d4 +vmlal.s32 q8,d19,d9 +vmlal.s32 q8,d27,d3 +vmlal.s32 q8,d22,d8 +vmlal.s32 q8,d28,d2 +vmlal.s32 q8,d23,d7 +vmlal.s32 q8,d29,d1 +vmlal.s32 q8,d24,d6 +vmlal.s32 q8,d25,d0 +add r2,sp,#624 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q2,d18,d4 +vmlal.s32 q2,d12,d9 +vmlal.s32 q2,d13,d8 +vmlal.s32 q2,d19,d3 +vmlal.s32 q2,d22,d2 +vmlal.s32 q2,d23,d1 +vmlal.s32 q2,d24,d0 +add r2,sp,#640 +vst1.8 {d20-d21},[r2,: 128] +vmull.s32 q7,d18,d9 +vmlal.s32 q7,d26,d3 +vmlal.s32 q7,d19,d8 +vmlal.s32 q7,d27,d2 +vmlal.s32 q7,d22,d7 +vmlal.s32 q7,d28,d1 +vmlal.s32 q7,d23,d6 +vmlal.s32 q7,d29,d0 +add r2,sp,#656 +vst1.8 {d10-d11},[r2,: 128] +vmull.s32 q5,d18,d3 +vmlal.s32 q5,d19,d2 +vmlal.s32 q5,d22,d1 +vmlal.s32 q5,d23,d0 +vmlal.s32 q5,d12,d8 +add r2,sp,#672 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q4,d18,d8 +vmlal.s32 q4,d26,d2 +vmlal.s32 q4,d19,d7 +vmlal.s32 q4,d27,d1 +vmlal.s32 q4,d22,d6 +vmlal.s32 q4,d28,d0 +vmull.s32 q8,d18,d7 +vmlal.s32 q8,d26,d1 +vmlal.s32 q8,d19,d6 +vmlal.s32 q8,d27,d0 +add r2,sp,#576 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q7,d24,d21 +vmlal.s32 q7,d25,d20 +vmlal.s32 q4,d23,d21 +vmlal.s32 q4,d29,d20 +vmlal.s32 q8,d22,d21 +vmlal.s32 q8,d28,d20 +vmlal.s32 q5,d24,d20 +add r2,sp,#576 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q7,d18,d6 +vmlal.s32 q7,d26,d0 +add r2,sp,#656 +vld1.8 {d30-d31},[r2,: 128] +vmlal.s32 q2,d30,d21 +vmlal.s32 q7,d19,d21 +vmlal.s32 q7,d27,d20 +add r2,sp,#624 +vld1.8 {d26-d27},[r2,: 128] +vmlal.s32 q4,d25,d27 +vmlal.s32 q8,d29,d27 +vmlal.s32 q8,d25,d26 +vmlal.s32 q7,d28,d27 +vmlal.s32 q7,d29,d26 +add r2,sp,#608 +vld1.8 {d28-d29},[r2,: 128] +vmlal.s32 q4,d24,d29 +vmlal.s32 q8,d23,d29 +vmlal.s32 q8,d24,d28 +vmlal.s32 q7,d22,d29 +vmlal.s32 
q7,d23,d28 +add r2,sp,#608 +vst1.8 {d8-d9},[r2,: 128] +add r2,sp,#560 +vld1.8 {d8-d9},[r2,: 128] +vmlal.s32 q7,d24,d9 +vmlal.s32 q7,d25,d31 +vmull.s32 q1,d18,d2 +vmlal.s32 q1,d19,d1 +vmlal.s32 q1,d22,d0 +vmlal.s32 q1,d24,d27 +vmlal.s32 q1,d23,d20 +vmlal.s32 q1,d12,d7 +vmlal.s32 q1,d13,d6 +vmull.s32 q6,d18,d1 +vmlal.s32 q6,d19,d0 +vmlal.s32 q6,d23,d27 +vmlal.s32 q6,d22,d20 +vmlal.s32 q6,d24,d26 +vmull.s32 q0,d18,d0 +vmlal.s32 q0,d22,d27 +vmlal.s32 q0,d23,d26 +vmlal.s32 q0,d24,d31 +vmlal.s32 q0,d19,d20 +add r2,sp,#640 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q2,d18,d7 +vmlal.s32 q2,d19,d6 +vmlal.s32 q5,d18,d6 +vmlal.s32 q5,d19,d21 +vmlal.s32 q1,d18,d21 +vmlal.s32 q1,d19,d29 +vmlal.s32 q0,d18,d28 +vmlal.s32 q0,d19,d9 +vmlal.s32 q6,d18,d29 +vmlal.s32 q6,d19,d28 +add r2,sp,#592 +vld1.8 {d18-d19},[r2,: 128] +add r2,sp,#512 +vld1.8 {d22-d23},[r2,: 128] +vmlal.s32 q5,d19,d7 +vmlal.s32 q0,d18,d21 +vmlal.s32 q0,d19,d29 +vmlal.s32 q6,d18,d6 +add r2,sp,#528 +vld1.8 {d6-d7},[r2,: 128] +vmlal.s32 q6,d19,d21 +add r2,sp,#576 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q0,d30,d8 +add r2,sp,#672 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q5,d30,d29 +add r2,sp,#608 +vld1.8 {d24-d25},[r2,: 128] +vmlal.s32 q1,d30,d28 +vadd.i64 q13,q0,q11 +vadd.i64 q14,q5,q11 +vmlal.s32 q6,d30,d9 +vshr.s64 q4,q13,#26 +vshr.s64 q13,q14,#26 +vadd.i64 q7,q7,q4 +vshl.i64 q4,q4,#26 +vadd.i64 q14,q7,q3 +vadd.i64 q9,q9,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q15,q9,q3 +vsub.i64 q0,q0,q4 +vshr.s64 q4,q14,#25 +vsub.i64 q5,q5,q13 +vshr.s64 q13,q15,#25 +vadd.i64 q6,q6,q4 +vshl.i64 q4,q4,#25 +vadd.i64 q14,q6,q11 +vadd.i64 q2,q2,q13 +vsub.i64 q4,q7,q4 +vshr.s64 q7,q14,#26 +vshl.i64 q13,q13,#25 +vadd.i64 q14,q2,q11 +vadd.i64 q8,q8,q7 +vshl.i64 q7,q7,#26 +vadd.i64 q15,q8,q3 +vsub.i64 q9,q9,q13 +vshr.s64 q13,q14,#26 +vsub.i64 q6,q6,q7 +vshr.s64 q7,q15,#25 +vadd.i64 q10,q10,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q14,q10,q3 +vadd.i64 q1,q1,q7 +add r2,r3,#288 +vshl.i64 q7,q7,#25 +add r4,r3,#96 +vadd.i64 q15,q1,q11 +add r2,r2,#8 +vsub.i64 q2,q2,q13 +add r4,r4,#8 +vshr.s64 q13,q14,#25 +vsub.i64 q7,q8,q7 +vshr.s64 q8,q15,#26 +vadd.i64 q14,q13,q13 +vadd.i64 q12,q12,q8 +vtrn.32 d12,d14 +vshl.i64 q8,q8,#26 +vtrn.32 d13,d15 +vadd.i64 q3,q12,q3 +vadd.i64 q0,q0,q14 +vst1.8 d12,[r2,: 64]! +vshl.i64 q7,q13,#4 +vst1.8 d13,[r4,: 64]! +vsub.i64 q1,q1,q8 +vshr.s64 q3,q3,#25 +vadd.i64 q0,q0,q7 +vadd.i64 q5,q5,q3 +vshl.i64 q3,q3,#25 +vadd.i64 q6,q5,q11 +vadd.i64 q0,q0,q13 +vshl.i64 q7,q13,#25 +vadd.i64 q8,q0,q11 +vsub.i64 q3,q12,q3 +vshr.s64 q6,q6,#26 +vsub.i64 q7,q10,q7 +vtrn.32 d2,d6 +vshr.s64 q8,q8,#26 +vtrn.32 d3,d7 +vadd.i64 q3,q9,q6 +vst1.8 d2,[r2,: 64] +vshl.i64 q6,q6,#26 +vst1.8 d3,[r4,: 64] +vadd.i64 q1,q4,q8 +vtrn.32 d4,d14 +vshl.i64 q4,q8,#26 +vtrn.32 d5,d15 +vsub.i64 q5,q5,q6 +add r2,r2,#16 +vsub.i64 q0,q0,q4 +vst1.8 d4,[r2,: 64] +add r4,r4,#16 +vst1.8 d5,[r4,: 64] +vtrn.32 d10,d6 +vtrn.32 d11,d7 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d0,d2 +vtrn.32 d1,d3 +vst1.8 d10,[r2,: 64] +vst1.8 d11,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d0,[r2,: 64] +vst1.8 d1,[r4,: 64] +add r2,sp,#544 +add r4,r3,#144 +add r5,r3,#192 +vld1.8 {d0-d1},[r2,: 128] +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4-d5},[r5,: 128]! +vzip.i32 q1,q2 +vld1.8 {d6-d7},[r4,: 128]! +vld1.8 {d8-d9},[r5,: 128]! 
+vshl.i32 q5,q1,#1 +vzip.i32 q3,q4 +vshl.i32 q6,q2,#1 +vld1.8 {d14},[r4,: 64] +vshl.i32 q8,q3,#1 +vld1.8 {d15},[r5,: 64] +vshl.i32 q9,q4,#1 +vmul.i32 d21,d7,d1 +vtrn.32 d14,d15 +vmul.i32 q11,q4,q0 +vmul.i32 q0,q7,q0 +vmull.s32 q12,d2,d2 +vmlal.s32 q12,d11,d1 +vmlal.s32 q12,d12,d0 +vmlal.s32 q12,d13,d23 +vmlal.s32 q12,d16,d22 +vmlal.s32 q12,d7,d21 +vmull.s32 q10,d2,d11 +vmlal.s32 q10,d4,d1 +vmlal.s32 q10,d13,d0 +vmlal.s32 q10,d6,d23 +vmlal.s32 q10,d17,d22 +vmull.s32 q13,d10,d4 +vmlal.s32 q13,d11,d3 +vmlal.s32 q13,d13,d1 +vmlal.s32 q13,d16,d0 +vmlal.s32 q13,d17,d23 +vmlal.s32 q13,d8,d22 +vmull.s32 q1,d10,d5 +vmlal.s32 q1,d11,d4 +vmlal.s32 q1,d6,d1 +vmlal.s32 q1,d17,d0 +vmlal.s32 q1,d8,d23 +vmull.s32 q14,d10,d6 +vmlal.s32 q14,d11,d13 +vmlal.s32 q14,d4,d4 +vmlal.s32 q14,d17,d1 +vmlal.s32 q14,d18,d0 +vmlal.s32 q14,d9,d23 +vmull.s32 q11,d10,d7 +vmlal.s32 q11,d11,d6 +vmlal.s32 q11,d12,d5 +vmlal.s32 q11,d8,d1 +vmlal.s32 q11,d19,d0 +vmull.s32 q15,d10,d8 +vmlal.s32 q15,d11,d17 +vmlal.s32 q15,d12,d6 +vmlal.s32 q15,d13,d5 +vmlal.s32 q15,d19,d1 +vmlal.s32 q15,d14,d0 +vmull.s32 q2,d10,d9 +vmlal.s32 q2,d11,d8 +vmlal.s32 q2,d12,d7 +vmlal.s32 q2,d13,d6 +vmlal.s32 q2,d14,d1 +vmull.s32 q0,d15,d1 +vmlal.s32 q0,d10,d14 +vmlal.s32 q0,d11,d19 +vmlal.s32 q0,d12,d8 +vmlal.s32 q0,d13,d17 +vmlal.s32 q0,d6,d6 +add r2,sp,#512 +vld1.8 {d18-d19},[r2,: 128] +vmull.s32 q3,d16,d7 +vmlal.s32 q3,d10,d15 +vmlal.s32 q3,d11,d14 +vmlal.s32 q3,d12,d9 +vmlal.s32 q3,d13,d8 +add r2,sp,#528 +vld1.8 {d8-d9},[r2,: 128] +vadd.i64 q5,q12,q9 +vadd.i64 q6,q15,q9 +vshr.s64 q5,q5,#26 +vshr.s64 q6,q6,#26 +vadd.i64 q7,q10,q5 +vshl.i64 q5,q5,#26 +vadd.i64 q8,q7,q4 +vadd.i64 q2,q2,q6 +vshl.i64 q6,q6,#26 +vadd.i64 q10,q2,q4 +vsub.i64 q5,q12,q5 +vshr.s64 q8,q8,#25 +vsub.i64 q6,q15,q6 +vshr.s64 q10,q10,#25 +vadd.i64 q12,q13,q8 +vshl.i64 q8,q8,#25 +vadd.i64 q13,q12,q9 +vadd.i64 q0,q0,q10 +vsub.i64 q7,q7,q8 +vshr.s64 q8,q13,#26 +vshl.i64 q10,q10,#25 +vadd.i64 q13,q0,q9 +vadd.i64 q1,q1,q8 +vshl.i64 q8,q8,#26 +vadd.i64 q15,q1,q4 +vsub.i64 q2,q2,q10 +vshr.s64 q10,q13,#26 +vsub.i64 q8,q12,q8 +vshr.s64 q12,q15,#25 +vadd.i64 q3,q3,q10 +vshl.i64 q10,q10,#26 +vadd.i64 q13,q3,q4 +vadd.i64 q14,q14,q12 +add r2,r3,#144 +vshl.i64 q12,q12,#25 +add r4,r3,#192 +vadd.i64 q15,q14,q9 +add r2,r2,#8 +vsub.i64 q0,q0,q10 +add r4,r4,#8 +vshr.s64 q10,q13,#25 +vsub.i64 q1,q1,q12 +vshr.s64 q12,q15,#26 +vadd.i64 q13,q10,q10 +vadd.i64 q11,q11,q12 +vtrn.32 d16,d2 +vshl.i64 q12,q12,#26 +vtrn.32 d17,d3 +vadd.i64 q1,q11,q4 +vadd.i64 q4,q5,q13 +vst1.8 d16,[r2,: 64]! +vshl.i64 q5,q10,#4 +vst1.8 d17,[r4,: 64]! +vsub.i64 q8,q14,q12 +vshr.s64 q1,q1,#25 +vadd.i64 q4,q4,q5 +vadd.i64 q5,q6,q1 +vshl.i64 q1,q1,#25 +vadd.i64 q6,q5,q9 +vadd.i64 q4,q4,q10 +vshl.i64 q10,q10,#25 +vadd.i64 q9,q4,q9 +vsub.i64 q1,q11,q1 +vshr.s64 q6,q6,#26 +vsub.i64 q3,q3,q10 +vtrn.32 d16,d2 +vshr.s64 q9,q9,#26 +vtrn.32 d17,d3 +vadd.i64 q1,q2,q6 +vst1.8 d16,[r2,: 64] +vshl.i64 q2,q6,#26 +vst1.8 d17,[r4,: 64] +vadd.i64 q6,q7,q9 +vtrn.32 d0,d6 +vshl.i64 q7,q9,#26 +vtrn.32 d1,d7 +vsub.i64 q2,q5,q2 +add r2,r2,#16 +vsub.i64 q3,q4,q7 +vst1.8 d0,[r2,: 64] +add r4,r4,#16 +vst1.8 d1,[r4,: 64] +vtrn.32 d4,d2 +vtrn.32 d5,d3 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d6,d12 +vtrn.32 d7,d13 +vst1.8 d4,[r2,: 64] +vst1.8 d5,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d6,[r2,: 64] +vst1.8 d7,[r4,: 64] +add r2,r3,#336 +add r4,r3,#288 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vadd.i32 q0,q0,q1 +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d4-d5},[r4,: 128]! 
+vadd.i32 q1,q1,q2 +add r5,r3,#288 +vld1.8 {d4},[r2,: 64] +vld1.8 {d6},[r4,: 64] +vadd.i32 q2,q2,q3 +vst1.8 {d0-d1},[r5,: 128]! +vst1.8 {d2-d3},[r5,: 128]! +vst1.8 d4,[r5,: 64] +add r2,r3,#48 +add r4,r3,#144 +vld1.8 {d0-d1},[r4,: 128]! +vld1.8 {d2-d3},[r4,: 128]! +vld1.8 {d4},[r4,: 64] +add r4,r3,#288 +vld1.8 {d6-d7},[r4,: 128]! +vtrn.32 q0,q3 +vld1.8 {d8-d9},[r4,: 128]! +vshl.i32 q5,q0,#4 +vtrn.32 q1,q4 +vshl.i32 q6,q3,#4 +vadd.i32 q5,q5,q0 +vadd.i32 q6,q6,q3 +vshl.i32 q7,q1,#4 +vld1.8 {d5},[r4,: 64] +vshl.i32 q8,q4,#4 +vtrn.32 d4,d5 +vadd.i32 q7,q7,q1 +vadd.i32 q8,q8,q4 +vld1.8 {d18-d19},[r2,: 128]! +vshl.i32 q10,q2,#4 +vld1.8 {d22-d23},[r2,: 128]! +vadd.i32 q10,q10,q2 +vld1.8 {d24},[r2,: 64] +vadd.i32 q5,q5,q0 +add r2,r3,#240 +vld1.8 {d26-d27},[r2,: 128]! +vadd.i32 q6,q6,q3 +vld1.8 {d28-d29},[r2,: 128]! +vadd.i32 q8,q8,q4 +vld1.8 {d25},[r2,: 64] +vadd.i32 q10,q10,q2 +vtrn.32 q9,q13 +vadd.i32 q7,q7,q1 +vadd.i32 q5,q5,q0 +vtrn.32 q11,q14 +vadd.i32 q6,q6,q3 +add r2,sp,#560 +vadd.i32 q10,q10,q2 +vtrn.32 d24,d25 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q6,q13,#1 +add r2,sp,#576 +vst1.8 {d20-d21},[r2,: 128] +vshl.i32 q10,q14,#1 +add r2,sp,#592 +vst1.8 {d12-d13},[r2,: 128] +vshl.i32 q15,q12,#1 +vadd.i32 q8,q8,q4 +vext.32 d10,d31,d30,#0 +vadd.i32 q7,q7,q1 +add r2,sp,#608 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q8,d18,d5 +vmlal.s32 q8,d26,d4 +vmlal.s32 q8,d19,d9 +vmlal.s32 q8,d27,d3 +vmlal.s32 q8,d22,d8 +vmlal.s32 q8,d28,d2 +vmlal.s32 q8,d23,d7 +vmlal.s32 q8,d29,d1 +vmlal.s32 q8,d24,d6 +vmlal.s32 q8,d25,d0 +add r2,sp,#624 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q2,d18,d4 +vmlal.s32 q2,d12,d9 +vmlal.s32 q2,d13,d8 +vmlal.s32 q2,d19,d3 +vmlal.s32 q2,d22,d2 +vmlal.s32 q2,d23,d1 +vmlal.s32 q2,d24,d0 +add r2,sp,#640 +vst1.8 {d20-d21},[r2,: 128] +vmull.s32 q7,d18,d9 +vmlal.s32 q7,d26,d3 +vmlal.s32 q7,d19,d8 +vmlal.s32 q7,d27,d2 +vmlal.s32 q7,d22,d7 +vmlal.s32 q7,d28,d1 +vmlal.s32 q7,d23,d6 +vmlal.s32 q7,d29,d0 +add r2,sp,#656 +vst1.8 {d10-d11},[r2,: 128] +vmull.s32 q5,d18,d3 +vmlal.s32 q5,d19,d2 +vmlal.s32 q5,d22,d1 +vmlal.s32 q5,d23,d0 +vmlal.s32 q5,d12,d8 +add r2,sp,#672 +vst1.8 {d16-d17},[r2,: 128] +vmull.s32 q4,d18,d8 +vmlal.s32 q4,d26,d2 +vmlal.s32 q4,d19,d7 +vmlal.s32 q4,d27,d1 +vmlal.s32 q4,d22,d6 +vmlal.s32 q4,d28,d0 +vmull.s32 q8,d18,d7 +vmlal.s32 q8,d26,d1 +vmlal.s32 q8,d19,d6 +vmlal.s32 q8,d27,d0 +add r2,sp,#576 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q7,d24,d21 +vmlal.s32 q7,d25,d20 +vmlal.s32 q4,d23,d21 +vmlal.s32 q4,d29,d20 +vmlal.s32 q8,d22,d21 +vmlal.s32 q8,d28,d20 +vmlal.s32 q5,d24,d20 +add r2,sp,#576 +vst1.8 {d14-d15},[r2,: 128] +vmull.s32 q7,d18,d6 +vmlal.s32 q7,d26,d0 +add r2,sp,#656 +vld1.8 {d30-d31},[r2,: 128] +vmlal.s32 q2,d30,d21 +vmlal.s32 q7,d19,d21 +vmlal.s32 q7,d27,d20 +add r2,sp,#624 +vld1.8 {d26-d27},[r2,: 128] +vmlal.s32 q4,d25,d27 +vmlal.s32 q8,d29,d27 +vmlal.s32 q8,d25,d26 +vmlal.s32 q7,d28,d27 +vmlal.s32 q7,d29,d26 +add r2,sp,#608 +vld1.8 {d28-d29},[r2,: 128] +vmlal.s32 q4,d24,d29 +vmlal.s32 q8,d23,d29 +vmlal.s32 q8,d24,d28 +vmlal.s32 q7,d22,d29 +vmlal.s32 q7,d23,d28 +add r2,sp,#608 +vst1.8 {d8-d9},[r2,: 128] +add r2,sp,#560 +vld1.8 {d8-d9},[r2,: 128] +vmlal.s32 q7,d24,d9 +vmlal.s32 q7,d25,d31 +vmull.s32 q1,d18,d2 +vmlal.s32 q1,d19,d1 +vmlal.s32 q1,d22,d0 +vmlal.s32 q1,d24,d27 +vmlal.s32 q1,d23,d20 +vmlal.s32 q1,d12,d7 +vmlal.s32 q1,d13,d6 +vmull.s32 q6,d18,d1 +vmlal.s32 q6,d19,d0 +vmlal.s32 q6,d23,d27 +vmlal.s32 q6,d22,d20 +vmlal.s32 q6,d24,d26 +vmull.s32 q0,d18,d0 +vmlal.s32 q0,d22,d27 +vmlal.s32 q0,d23,d26 +vmlal.s32 q0,d24,d31 +vmlal.s32 q0,d19,d20 +add r2,sp,#640 
+vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q2,d18,d7 +vmlal.s32 q2,d19,d6 +vmlal.s32 q5,d18,d6 +vmlal.s32 q5,d19,d21 +vmlal.s32 q1,d18,d21 +vmlal.s32 q1,d19,d29 +vmlal.s32 q0,d18,d28 +vmlal.s32 q0,d19,d9 +vmlal.s32 q6,d18,d29 +vmlal.s32 q6,d19,d28 +add r2,sp,#592 +vld1.8 {d18-d19},[r2,: 128] +add r2,sp,#512 +vld1.8 {d22-d23},[r2,: 128] +vmlal.s32 q5,d19,d7 +vmlal.s32 q0,d18,d21 +vmlal.s32 q0,d19,d29 +vmlal.s32 q6,d18,d6 +add r2,sp,#528 +vld1.8 {d6-d7},[r2,: 128] +vmlal.s32 q6,d19,d21 +add r2,sp,#576 +vld1.8 {d18-d19},[r2,: 128] +vmlal.s32 q0,d30,d8 +add r2,sp,#672 +vld1.8 {d20-d21},[r2,: 128] +vmlal.s32 q5,d30,d29 +add r2,sp,#608 +vld1.8 {d24-d25},[r2,: 128] +vmlal.s32 q1,d30,d28 +vadd.i64 q13,q0,q11 +vadd.i64 q14,q5,q11 +vmlal.s32 q6,d30,d9 +vshr.s64 q4,q13,#26 +vshr.s64 q13,q14,#26 +vadd.i64 q7,q7,q4 +vshl.i64 q4,q4,#26 +vadd.i64 q14,q7,q3 +vadd.i64 q9,q9,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q15,q9,q3 +vsub.i64 q0,q0,q4 +vshr.s64 q4,q14,#25 +vsub.i64 q5,q5,q13 +vshr.s64 q13,q15,#25 +vadd.i64 q6,q6,q4 +vshl.i64 q4,q4,#25 +vadd.i64 q14,q6,q11 +vadd.i64 q2,q2,q13 +vsub.i64 q4,q7,q4 +vshr.s64 q7,q14,#26 +vshl.i64 q13,q13,#25 +vadd.i64 q14,q2,q11 +vadd.i64 q8,q8,q7 +vshl.i64 q7,q7,#26 +vadd.i64 q15,q8,q3 +vsub.i64 q9,q9,q13 +vshr.s64 q13,q14,#26 +vsub.i64 q6,q6,q7 +vshr.s64 q7,q15,#25 +vadd.i64 q10,q10,q13 +vshl.i64 q13,q13,#26 +vadd.i64 q14,q10,q3 +vadd.i64 q1,q1,q7 +add r2,r3,#240 +vshl.i64 q7,q7,#25 +add r4,r3,#144 +vadd.i64 q15,q1,q11 +add r2,r2,#8 +vsub.i64 q2,q2,q13 +add r4,r4,#8 +vshr.s64 q13,q14,#25 +vsub.i64 q7,q8,q7 +vshr.s64 q8,q15,#26 +vadd.i64 q14,q13,q13 +vadd.i64 q12,q12,q8 +vtrn.32 d12,d14 +vshl.i64 q8,q8,#26 +vtrn.32 d13,d15 +vadd.i64 q3,q12,q3 +vadd.i64 q0,q0,q14 +vst1.8 d12,[r2,: 64]! +vshl.i64 q7,q13,#4 +vst1.8 d13,[r4,: 64]! +vsub.i64 q1,q1,q8 +vshr.s64 q3,q3,#25 +vadd.i64 q0,q0,q7 +vadd.i64 q5,q5,q3 +vshl.i64 q3,q3,#25 +vadd.i64 q6,q5,q11 +vadd.i64 q0,q0,q13 +vshl.i64 q7,q13,#25 +vadd.i64 q8,q0,q11 +vsub.i64 q3,q12,q3 +vshr.s64 q6,q6,#26 +vsub.i64 q7,q10,q7 +vtrn.32 d2,d6 +vshr.s64 q8,q8,#26 +vtrn.32 d3,d7 +vadd.i64 q3,q9,q6 +vst1.8 d2,[r2,: 64] +vshl.i64 q6,q6,#26 +vst1.8 d3,[r4,: 64] +vadd.i64 q1,q4,q8 +vtrn.32 d4,d14 +vshl.i64 q4,q8,#26 +vtrn.32 d5,d15 +vsub.i64 q5,q5,q6 +add r2,r2,#16 +vsub.i64 q0,q0,q4 +vst1.8 d4,[r2,: 64] +add r4,r4,#16 +vst1.8 d5,[r4,: 64] +vtrn.32 d10,d6 +vtrn.32 d11,d7 +sub r2,r2,#8 +sub r4,r4,#8 +vtrn.32 d0,d2 +vtrn.32 d1,d3 +vst1.8 d10,[r2,: 64] +vst1.8 d11,[r4,: 64] +sub r2,r2,#24 +sub r4,r4,#24 +vst1.8 d0,[r2,: 64] +vst1.8 d1,[r4,: 64] +ldr r2,[sp,#488] +ldr r4,[sp,#492] +subs r5,r2,#1 +bge ._mainloop +add r1,r3,#144 +add r2,r3,#336 +vld1.8 {d0-d1},[r1,: 128]! +vld1.8 {d2-d3},[r1,: 128]! +vld1.8 {d4},[r1,: 64] +vst1.8 {d0-d1},[r2,: 128]! +vst1.8 {d2-d3},[r2,: 128]! +vst1.8 d4,[r2,: 64] +ldr r1,=0 +._invertloop: +add r2,r3,#144 +ldr r4,=0 +ldr r5,=2 +cmp r1,#1 +ldreq r5,=1 +addeq r2,r3,#336 +addeq r4,r3,#48 +cmp r1,#2 +ldreq r5,=1 +addeq r2,r3,#48 +cmp r1,#3 +ldreq r5,=5 +addeq r4,r3,#336 +cmp r1,#4 +ldreq r5,=10 +cmp r1,#5 +ldreq r5,=20 +cmp r1,#6 +ldreq r5,=10 +addeq r2,r3,#336 +addeq r4,r3,#336 +cmp r1,#7 +ldreq r5,=50 +cmp r1,#8 +ldreq r5,=100 +cmp r1,#9 +ldreq r5,=50 +addeq r2,r3,#336 +cmp r1,#10 +ldreq r5,=5 +addeq r2,r3,#48 +cmp r1,#11 +ldreq r5,=0 +addeq r2,r3,#96 +add r6,r3,#144 +add r7,r3,#288 +vld1.8 {d0-d1},[r6,: 128]! +vld1.8 {d2-d3},[r6,: 128]! +vld1.8 {d4},[r6,: 64] +vst1.8 {d0-d1},[r7,: 128]! +vst1.8 {d2-d3},[r7,: 128]! 
+vst1.8 d4,[r7,: 64] +cmp r5,#0 +beq ._skipsquaringloop +._squaringloop: +add r6,r3,#288 +add r7,r3,#288 +add r8,r3,#288 +vmov.i32 q0,#19 +vmov.i32 q1,#0 +vmov.i32 q2,#1 +vzip.i32 q1,q2 +vld1.8 {d4-d5},[r7,: 128]! +vld1.8 {d6-d7},[r7,: 128]! +vld1.8 {d9},[r7,: 64] +vld1.8 {d10-d11},[r6,: 128]! +add r7,sp,#416 +vld1.8 {d12-d13},[r6,: 128]! +vmul.i32 q7,q2,q0 +vld1.8 {d8},[r6,: 64] +vext.32 d17,d11,d10,#1 +vmul.i32 q9,q3,q0 +vext.32 d16,d10,d8,#1 +vshl.u32 q10,q5,q1 +vext.32 d22,d14,d4,#1 +vext.32 d24,d18,d6,#1 +vshl.u32 q13,q6,q1 +vshl.u32 d28,d8,d2 +vrev64.i32 d22,d22 +vmul.i32 d1,d9,d1 +vrev64.i32 d24,d24 +vext.32 d29,d8,d13,#1 +vext.32 d0,d1,d9,#1 +vrev64.i32 d0,d0 +vext.32 d2,d9,d1,#1 +vext.32 d23,d15,d5,#1 +vmull.s32 q4,d20,d4 +vrev64.i32 d23,d23 +vmlal.s32 q4,d21,d1 +vrev64.i32 d2,d2 +vmlal.s32 q4,d26,d19 +vext.32 d3,d5,d15,#1 +vmlal.s32 q4,d27,d18 +vrev64.i32 d3,d3 +vmlal.s32 q4,d28,d15 +vext.32 d14,d12,d11,#1 +vmull.s32 q5,d16,d23 +vext.32 d15,d13,d12,#1 +vmlal.s32 q5,d17,d4 +vst1.8 d8,[r7,: 64]! +vmlal.s32 q5,d14,d1 +vext.32 d12,d9,d8,#0 +vmlal.s32 q5,d15,d19 +vmov.i64 d13,#0 +vmlal.s32 q5,d29,d18 +vext.32 d25,d19,d7,#1 +vmlal.s32 q6,d20,d5 +vrev64.i32 d25,d25 +vmlal.s32 q6,d21,d4 +vst1.8 d11,[r7,: 64]! +vmlal.s32 q6,d26,d1 +vext.32 d9,d10,d10,#0 +vmlal.s32 q6,d27,d19 +vmov.i64 d8,#0 +vmlal.s32 q6,d28,d18 +vmlal.s32 q4,d16,d24 +vmlal.s32 q4,d17,d5 +vmlal.s32 q4,d14,d4 +vst1.8 d12,[r7,: 64]! +vmlal.s32 q4,d15,d1 +vext.32 d10,d13,d12,#0 +vmlal.s32 q4,d29,d19 +vmov.i64 d11,#0 +vmlal.s32 q5,d20,d6 +vmlal.s32 q5,d21,d5 +vmlal.s32 q5,d26,d4 +vext.32 d13,d8,d8,#0 +vmlal.s32 q5,d27,d1 +vmov.i64 d12,#0 +vmlal.s32 q5,d28,d19 +vst1.8 d9,[r7,: 64]! +vmlal.s32 q6,d16,d25 +vmlal.s32 q6,d17,d6 +vst1.8 d10,[r7,: 64] +vmlal.s32 q6,d14,d5 +vext.32 d8,d11,d10,#0 +vmlal.s32 q6,d15,d4 +vmov.i64 d9,#0 +vmlal.s32 q6,d29,d1 +vmlal.s32 q4,d20,d7 +vmlal.s32 q4,d21,d6 +vmlal.s32 q4,d26,d5 +vext.32 d11,d12,d12,#0 +vmlal.s32 q4,d27,d4 +vmov.i64 d10,#0 +vmlal.s32 q4,d28,d1 +vmlal.s32 q5,d16,d0 +sub r6,r7,#32 +vmlal.s32 q5,d17,d7 +vmlal.s32 q5,d14,d6 +vext.32 d30,d9,d8,#0 +vmlal.s32 q5,d15,d5 +vld1.8 {d31},[r6,: 64]! +vmlal.s32 q5,d29,d4 +vmlal.s32 q15,d20,d0 +vext.32 d0,d6,d18,#1 +vmlal.s32 q15,d21,d25 +vrev64.i32 d0,d0 +vmlal.s32 q15,d26,d24 +vext.32 d1,d7,d19,#1 +vext.32 d7,d10,d10,#0 +vmlal.s32 q15,d27,d23 +vrev64.i32 d1,d1 +vld1.8 {d6},[r6,: 64] +vmlal.s32 q15,d28,d22 +vmlal.s32 q3,d16,d4 +add r6,r6,#24 +vmlal.s32 q3,d17,d2 +vext.32 d4,d31,d30,#0 +vmov d17,d11 +vmlal.s32 q3,d14,d1 +vext.32 d11,d13,d13,#0 +vext.32 d13,d30,d30,#0 +vmlal.s32 q3,d15,d0 +vext.32 d1,d8,d8,#0 +vmlal.s32 q3,d29,d3 +vld1.8 {d5},[r6,: 64] +sub r6,r6,#16 +vext.32 d10,d6,d6,#0 +vmov.i32 q1,#0xffffffff +vshl.i64 q4,q1,#25 +add r7,sp,#512 +vld1.8 {d14-d15},[r7,: 128] +vadd.i64 q9,q2,q7 +vshl.i64 q1,q1,#26 +vshr.s64 q10,q9,#26 +vld1.8 {d0},[r6,: 64]! +vadd.i64 q5,q5,q10 +vand q9,q9,q1 +vld1.8 {d16},[r6,: 64]! 
+add r6,sp,#528 +vld1.8 {d20-d21},[r6,: 128] +vadd.i64 q11,q5,q10 +vsub.i64 q2,q2,q9 +vshr.s64 q9,q11,#25 +vext.32 d12,d5,d4,#0 +vand q11,q11,q4 +vadd.i64 q0,q0,q9 +vmov d19,d7 +vadd.i64 q3,q0,q7 +vsub.i64 q5,q5,q11 +vshr.s64 q11,q3,#26 +vext.32 d18,d11,d10,#0 +vand q3,q3,q1 +vadd.i64 q8,q8,q11 +vadd.i64 q11,q8,q10 +vsub.i64 q0,q0,q3 +vshr.s64 q3,q11,#25 +vand q11,q11,q4 +vadd.i64 q3,q6,q3 +vadd.i64 q6,q3,q7 +vsub.i64 q8,q8,q11 +vshr.s64 q11,q6,#26 +vand q6,q6,q1 +vadd.i64 q9,q9,q11 +vadd.i64 d25,d19,d21 +vsub.i64 q3,q3,q6 +vshr.s64 d23,d25,#25 +vand q4,q12,q4 +vadd.i64 d21,d23,d23 +vshl.i64 d25,d23,#4 +vadd.i64 d21,d21,d23 +vadd.i64 d25,d25,d21 +vadd.i64 d4,d4,d25 +vzip.i32 q0,q8 +vadd.i64 d12,d4,d14 +add r6,r8,#8 +vst1.8 d0,[r6,: 64] +vsub.i64 d19,d19,d9 +add r6,r6,#16 +vst1.8 d16,[r6,: 64] +vshr.s64 d22,d12,#26 +vand q0,q6,q1 +vadd.i64 d10,d10,d22 +vzip.i32 q3,q9 +vsub.i64 d4,d4,d0 +sub r6,r6,#8 +vst1.8 d6,[r6,: 64] +add r6,r6,#16 +vst1.8 d18,[r6,: 64] +vzip.i32 q2,q5 +sub r6,r6,#32 +vst1.8 d4,[r6,: 64] +subs r5,r5,#1 +bhi ._squaringloop +._skipsquaringloop: +mov r2,r2 +add r5,r3,#288 +add r6,r3,#144 +vmov.i32 q0,#19 +vmov.i32 q1,#0 +vmov.i32 q2,#1 +vzip.i32 q1,q2 +vld1.8 {d4-d5},[r5,: 128]! +vld1.8 {d6-d7},[r5,: 128]! +vld1.8 {d9},[r5,: 64] +vld1.8 {d10-d11},[r2,: 128]! +add r5,sp,#416 +vld1.8 {d12-d13},[r2,: 128]! +vmul.i32 q7,q2,q0 +vld1.8 {d8},[r2,: 64] +vext.32 d17,d11,d10,#1 +vmul.i32 q9,q3,q0 +vext.32 d16,d10,d8,#1 +vshl.u32 q10,q5,q1 +vext.32 d22,d14,d4,#1 +vext.32 d24,d18,d6,#1 +vshl.u32 q13,q6,q1 +vshl.u32 d28,d8,d2 +vrev64.i32 d22,d22 +vmul.i32 d1,d9,d1 +vrev64.i32 d24,d24 +vext.32 d29,d8,d13,#1 +vext.32 d0,d1,d9,#1 +vrev64.i32 d0,d0 +vext.32 d2,d9,d1,#1 +vext.32 d23,d15,d5,#1 +vmull.s32 q4,d20,d4 +vrev64.i32 d23,d23 +vmlal.s32 q4,d21,d1 +vrev64.i32 d2,d2 +vmlal.s32 q4,d26,d19 +vext.32 d3,d5,d15,#1 +vmlal.s32 q4,d27,d18 +vrev64.i32 d3,d3 +vmlal.s32 q4,d28,d15 +vext.32 d14,d12,d11,#1 +vmull.s32 q5,d16,d23 +vext.32 d15,d13,d12,#1 +vmlal.s32 q5,d17,d4 +vst1.8 d8,[r5,: 64]! +vmlal.s32 q5,d14,d1 +vext.32 d12,d9,d8,#0 +vmlal.s32 q5,d15,d19 +vmov.i64 d13,#0 +vmlal.s32 q5,d29,d18 +vext.32 d25,d19,d7,#1 +vmlal.s32 q6,d20,d5 +vrev64.i32 d25,d25 +vmlal.s32 q6,d21,d4 +vst1.8 d11,[r5,: 64]! +vmlal.s32 q6,d26,d1 +vext.32 d9,d10,d10,#0 +vmlal.s32 q6,d27,d19 +vmov.i64 d8,#0 +vmlal.s32 q6,d28,d18 +vmlal.s32 q4,d16,d24 +vmlal.s32 q4,d17,d5 +vmlal.s32 q4,d14,d4 +vst1.8 d12,[r5,: 64]! +vmlal.s32 q4,d15,d1 +vext.32 d10,d13,d12,#0 +vmlal.s32 q4,d29,d19 +vmov.i64 d11,#0 +vmlal.s32 q5,d20,d6 +vmlal.s32 q5,d21,d5 +vmlal.s32 q5,d26,d4 +vext.32 d13,d8,d8,#0 +vmlal.s32 q5,d27,d1 +vmov.i64 d12,#0 +vmlal.s32 q5,d28,d19 +vst1.8 d9,[r5,: 64]! +vmlal.s32 q6,d16,d25 +vmlal.s32 q6,d17,d6 +vst1.8 d10,[r5,: 64] +vmlal.s32 q6,d14,d5 +vext.32 d8,d11,d10,#0 +vmlal.s32 q6,d15,d4 +vmov.i64 d9,#0 +vmlal.s32 q6,d29,d1 +vmlal.s32 q4,d20,d7 +vmlal.s32 q4,d21,d6 +vmlal.s32 q4,d26,d5 +vext.32 d11,d12,d12,#0 +vmlal.s32 q4,d27,d4 +vmov.i64 d10,#0 +vmlal.s32 q4,d28,d1 +vmlal.s32 q5,d16,d0 +sub r2,r5,#32 +vmlal.s32 q5,d17,d7 +vmlal.s32 q5,d14,d6 +vext.32 d30,d9,d8,#0 +vmlal.s32 q5,d15,d5 +vld1.8 {d31},[r2,: 64]! 
+vmlal.s32 q5,d29,d4 +vmlal.s32 q15,d20,d0 +vext.32 d0,d6,d18,#1 +vmlal.s32 q15,d21,d25 +vrev64.i32 d0,d0 +vmlal.s32 q15,d26,d24 +vext.32 d1,d7,d19,#1 +vext.32 d7,d10,d10,#0 +vmlal.s32 q15,d27,d23 +vrev64.i32 d1,d1 +vld1.8 {d6},[r2,: 64] +vmlal.s32 q15,d28,d22 +vmlal.s32 q3,d16,d4 +add r2,r2,#24 +vmlal.s32 q3,d17,d2 +vext.32 d4,d31,d30,#0 +vmov d17,d11 +vmlal.s32 q3,d14,d1 +vext.32 d11,d13,d13,#0 +vext.32 d13,d30,d30,#0 +vmlal.s32 q3,d15,d0 +vext.32 d1,d8,d8,#0 +vmlal.s32 q3,d29,d3 +vld1.8 {d5},[r2,: 64] +sub r2,r2,#16 +vext.32 d10,d6,d6,#0 +vmov.i32 q1,#0xffffffff +vshl.i64 q4,q1,#25 +add r5,sp,#512 +vld1.8 {d14-d15},[r5,: 128] +vadd.i64 q9,q2,q7 +vshl.i64 q1,q1,#26 +vshr.s64 q10,q9,#26 +vld1.8 {d0},[r2,: 64]! +vadd.i64 q5,q5,q10 +vand q9,q9,q1 +vld1.8 {d16},[r2,: 64]! +add r2,sp,#528 +vld1.8 {d20-d21},[r2,: 128] +vadd.i64 q11,q5,q10 +vsub.i64 q2,q2,q9 +vshr.s64 q9,q11,#25 +vext.32 d12,d5,d4,#0 +vand q11,q11,q4 +vadd.i64 q0,q0,q9 +vmov d19,d7 +vadd.i64 q3,q0,q7 +vsub.i64 q5,q5,q11 +vshr.s64 q11,q3,#26 +vext.32 d18,d11,d10,#0 +vand q3,q3,q1 +vadd.i64 q8,q8,q11 +vadd.i64 q11,q8,q10 +vsub.i64 q0,q0,q3 +vshr.s64 q3,q11,#25 +vand q11,q11,q4 +vadd.i64 q3,q6,q3 +vadd.i64 q6,q3,q7 +vsub.i64 q8,q8,q11 +vshr.s64 q11,q6,#26 +vand q6,q6,q1 +vadd.i64 q9,q9,q11 +vadd.i64 d25,d19,d21 +vsub.i64 q3,q3,q6 +vshr.s64 d23,d25,#25 +vand q4,q12,q4 +vadd.i64 d21,d23,d23 +vshl.i64 d25,d23,#4 +vadd.i64 d21,d21,d23 +vadd.i64 d25,d25,d21 +vadd.i64 d4,d4,d25 +vzip.i32 q0,q8 +vadd.i64 d12,d4,d14 +add r2,r6,#8 +vst1.8 d0,[r2,: 64] +vsub.i64 d19,d19,d9 +add r2,r2,#16 +vst1.8 d16,[r2,: 64] +vshr.s64 d22,d12,#26 +vand q0,q6,q1 +vadd.i64 d10,d10,d22 +vzip.i32 q3,q9 +vsub.i64 d4,d4,d0 +sub r2,r2,#8 +vst1.8 d6,[r2,: 64] +add r2,r2,#16 +vst1.8 d18,[r2,: 64] +vzip.i32 q2,q5 +sub r2,r2,#32 +vst1.8 d4,[r2,: 64] +cmp r4,#0 +beq ._skippostcopy +add r2,r3,#144 +mov r4,r4 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d4},[r2,: 64] +vst1.8 {d0-d1},[r4,: 128]! +vst1.8 {d2-d3},[r4,: 128]! +vst1.8 d4,[r4,: 64] +._skippostcopy: +cmp r1,#1 +bne ._skipfinalcopy +add r2,r3,#288 +add r4,r3,#144 +vld1.8 {d0-d1},[r2,: 128]! +vld1.8 {d2-d3},[r2,: 128]! +vld1.8 {d4},[r2,: 64] +vst1.8 {d0-d1},[r4,: 128]! +vst1.8 {d2-d3},[r4,: 128]! 
+vst1.8 d4,[r4,: 64] +._skipfinalcopy: +add r1,r1,#1 +cmp r1,#12 +blo ._invertloop +add r1,r3,#144 +ldr r2,[r1],#4 +ldr r3,[r1],#4 +ldr r4,[r1],#4 +ldr r5,[r1],#4 +ldr r6,[r1],#4 +ldr r7,[r1],#4 +ldr r8,[r1],#4 +ldr r9,[r1],#4 +ldr r10,[r1],#4 +ldr r1,[r1] +add r11,r1,r1,LSL #4 +add r11,r11,r1,LSL #1 +add r11,r11,#16777216 +mov r11,r11,ASR #25 +add r11,r11,r2 +mov r11,r11,ASR #26 +add r11,r11,r3 +mov r11,r11,ASR #25 +add r11,r11,r4 +mov r11,r11,ASR #26 +add r11,r11,r5 +mov r11,r11,ASR #25 +add r11,r11,r6 +mov r11,r11,ASR #26 +add r11,r11,r7 +mov r11,r11,ASR #25 +add r11,r11,r8 +mov r11,r11,ASR #26 +add r11,r11,r9 +mov r11,r11,ASR #25 +add r11,r11,r10 +mov r11,r11,ASR #26 +add r11,r11,r1 +mov r11,r11,ASR #25 +add r2,r2,r11 +add r2,r2,r11,LSL #1 +add r2,r2,r11,LSL #4 +mov r11,r2,ASR #26 +add r3,r3,r11 +sub r2,r2,r11,LSL #26 +mov r11,r3,ASR #25 +add r4,r4,r11 +sub r3,r3,r11,LSL #25 +mov r11,r4,ASR #26 +add r5,r5,r11 +sub r4,r4,r11,LSL #26 +mov r11,r5,ASR #25 +add r6,r6,r11 +sub r5,r5,r11,LSL #25 +mov r11,r6,ASR #26 +add r7,r7,r11 +sub r6,r6,r11,LSL #26 +mov r11,r7,ASR #25 +add r8,r8,r11 +sub r7,r7,r11,LSL #25 +mov r11,r8,ASR #26 +add r9,r9,r11 +sub r8,r8,r11,LSL #26 +mov r11,r9,ASR #25 +add r10,r10,r11 +sub r9,r9,r11,LSL #25 +mov r11,r10,ASR #26 +add r1,r1,r11 +sub r10,r10,r11,LSL #26 +mov r11,r1,ASR #25 +sub r1,r1,r11,LSL #25 +add r2,r2,r3,LSL #26 +mov r3,r3,LSR #6 +add r3,r3,r4,LSL #19 +mov r4,r4,LSR #13 +add r4,r4,r5,LSL #13 +mov r5,r5,LSR #19 +add r5,r5,r6,LSL #6 +add r6,r7,r8,LSL #25 +mov r7,r8,LSR #7 +add r7,r7,r9,LSL #19 +mov r8,r9,LSR #13 +add r8,r8,r10,LSL #12 +mov r9,r10,LSR #20 +add r1,r9,r1,LSL #6 +str r2,[r0],#4 +str r3,[r0],#4 +str r4,[r0],#4 +str r5,[r0],#4 +str r6,[r0],#4 +str r7,[r0],#4 +str r8,[r0],#4 +str r1,[r0] +ldrd r4,[sp,#0] +ldrd r6,[sp,#8] +ldrd r8,[sp,#16] +ldrd r10,[sp,#24] +ldr r12,[sp,#480] +ldr r14,[sp,#484] +ldr r0,=0 +mov sp,r12 +vpop {q4,q5,q6,q7} +bx lr + +#endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */ diff --git a/ring-0.17.14/crypto/curve25519/curve25519.c b/ring-0.17.14/crypto/curve25519/curve25519.c new file mode 100644 index 0000000000..99d7d7fbb2 --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/curve25519.c @@ -0,0 +1,1924 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP +// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as +// public domain. Other parts have been replaced to call into code generated by +// Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat. +// +// The field functions are shared by Ed25519 and X25519 where possible. 
+ +#include + +#include "internal.h" +#include "../internal.h" + +#if defined(_MSC_VER) && !defined(__clang__) +// '=': conversion from 'int64_t' to 'int32_t', possible loss of data +#pragma warning(disable: 4242) +// '=': conversion from 'int32_t' to 'uint8_t', possible loss of data +#pragma warning(disable: 4244) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Winline" +#endif + +// Various pre-computed constants. +#include "./curve25519_tables.h" + +#if defined(BORINGSSL_HAS_UINT128) +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include "../../third_party/fiat/curve25519_64.h" +#elif defined(OPENSSL_64_BIT) +#include "../../third_party/fiat/curve25519_64_msvc.h" +#else +#include "../../third_party/fiat/curve25519_32.h" +#endif + + +// Low-level intrinsic operations + +static uint64_t load_3(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + return result; +} + +static uint64_t load_4(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + return result; +} + + +// Field operations. + +#if defined(OPENSSL_64_BIT) + +// assert_fe asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc]] +// +// See comments in curve25519_64.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ + } \ + } while (0) + +// assert_fe_loose asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664]] +// +// See comments in curve25519_64.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe_loose(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ + } \ + } while (0) + +#else + +// assert_fe asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]] +// +// See comments in curve25519_32.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \ + } \ + } while (0) + +// assert_fe_loose asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]] +// +// See comments in curve25519_32.h for which functions use these bounds for +// inputs or outputs. 
+#define assert_fe_loose(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \ + } \ + } while (0) + +#endif // OPENSSL_64_BIT + +OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS, + "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe"); + +static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { + // |fiat_25519_from_bytes| requires the top-most bit be clear. + declassify_assert((s[31] & 0x80) == 0); + fiat_25519_from_bytes(h->v, s); + assert_fe(h->v); +} + +static void fe_frombytes(fe *h, const uint8_t s[32]) { + uint8_t s_copy[32]; + OPENSSL_memcpy(s_copy, s, 32); + s_copy[31] &= 0x7f; + fe_frombytes_strict(h, s_copy); +} + +static void fe_tobytes(uint8_t s[32], const fe *f) { + assert_fe(f->v); + fiat_25519_to_bytes(s, f->v); +} + +// h = 0 +static void fe_0(fe *h) { + OPENSSL_memset(h, 0, sizeof(fe)); +} + +#if defined(OPENSSL_SMALL) + +static void fe_loose_0(fe_loose *h) { + OPENSSL_memset(h, 0, sizeof(fe_loose)); +} + +#endif + +// h = 1 +static void fe_1(fe *h) { + OPENSSL_memset(h, 0, sizeof(fe)); + h->v[0] = 1; +} + +#if defined(OPENSSL_SMALL) + +static void fe_loose_1(fe_loose *h) { + OPENSSL_memset(h, 0, sizeof(fe_loose)); + h->v[0] = 1; +} + +#endif + +// h = f + g +// Can overlap h with f or g. +static void fe_add(fe_loose *h, const fe *f, const fe *g) { + assert_fe(f->v); + assert_fe(g->v); + fiat_25519_add(h->v, f->v, g->v); + assert_fe_loose(h->v); +} + +// h = f - g +// Can overlap h with f or g. +static void fe_sub(fe_loose *h, const fe *f, const fe *g) { + assert_fe(f->v); + assert_fe(g->v); + fiat_25519_sub(h->v, f->v, g->v); + assert_fe_loose(h->v); +} + +static void fe_carry(fe *h, const fe_loose* f) { + assert_fe_loose(f->v); + fiat_25519_carry(h->v, f->v); + assert_fe(h->v); +} + +static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS], + const fe_limb_t in1[FE_NUM_LIMBS], + const fe_limb_t in2[FE_NUM_LIMBS]) { + assert_fe_loose(in1); + assert_fe_loose(in2); + fiat_25519_carry_mul(out, in1, in2); + assert_fe(out); +} + +static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +#if defined(OPENSSL_SMALL) +static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} +#endif + +static void fe_mul_ttt(fe *h, const fe *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_sq_tl(fe *h, const fe_loose *f) { + assert_fe_loose(f->v); + fiat_25519_carry_square(h->v, f->v); + assert_fe(h->v); +} + +static void fe_sq_tt(fe *h, const fe *f) { + assert_fe_loose(f->v); + fiat_25519_carry_square(h->v, f->v); + assert_fe(h->v); +} + +// Replace (f,g) with (g,f) if b == 1; +// replace (f,g) with (f,g) if b == 0. +// +// Preconditions: b in {0,1}. 
+static void fe_cswap(fe *f, fe *g, fe_limb_t b) { + b = 0-b; + for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { + fe_limb_t x = f->v[i] ^ g->v[i]; + x &= b; + f->v[i] ^= x; + g->v[i] ^= x; + } +} + +static void fe_mul121666(fe *h, const fe_loose *f) { + assert_fe_loose(f->v); + fiat_25519_carry_scmul_121666(h->v, f->v); + assert_fe(h->v); +} + +// h = -f +static void fe_neg(fe_loose *h, const fe *f) { + assert_fe(f->v); + fiat_25519_opp(h->v, f->v); + assert_fe_loose(h->v); +} + +// Replace (f,g) with (g,g) if b == 1; +// replace (f,g) with (f,g) if b == 0. +// +// Preconditions: b in {0,1}. +static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) { + // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a + // different one. + + b = 0-b; + for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { + fe_limb_t x = f->v[i] ^ g->v[i]; + x &= b; + f->v[i] ^= x; + } +} + +// h = f +static void fe_copy(fe *h, const fe *f) { + fe_limbs_copy(h->v, f->v); +} + +static void fe_copy_lt(fe_loose *h, const fe *f) { + OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch"); + fe_limbs_copy(h->v, f->v); +} + +static void fe_loose_invert(fe *out, const fe_loose *z) { + fe t0; + fe t1; + fe t2; + fe t3; + int i; + + fe_sq_tl(&t0, z); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_tlt(&t1, z, &t1); + fe_mul_ttt(&t0, &t0, &t1); + fe_sq_tt(&t2, &t0); + fe_mul_ttt(&t1, &t1, &t2); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 20; ++i) { + fe_sq_tt(&t3, &t3); + } + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 100; ++i) { + fe_sq_tt(&t3, &t3); + } + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(out, &t1, &t0); +} + +static void fe_invert(fe *out, const fe *z) { + fe_loose l; + fe_copy_lt(&l, z); + fe_loose_invert(out, &l); +} + +// return 0 if f == 0 +// return 1 if f != 0 +static int fe_isnonzero(const fe_loose *f) { + fe tight; + fe_carry(&tight, f); + uint8_t s[32]; + fe_tobytes(s, &tight); + + static const uint8_t zero[32] = {0}; + return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0; +} + +// return 1 if f is in {1,3,5,...,q-2} +// return 0 if f is in {0,2,4,...,q-1} +static int fe_isnegative(const fe *f) { + uint8_t s[32]; + fe_tobytes(s, f); + return s[0] & 1; +} + +static void fe_sq2_tt(fe *h, const fe *f) { + // h = f^2 + fe_sq_tt(h, f); + + // h = h + h + fe_loose tmp; + fe_add(&tmp, h, h); + fe_carry(h, &tmp); +} + +static void fe_pow22523(fe *out, const fe *z) { + fe t0; + fe t1; + fe t2; + int i; + + fe_sq_tt(&t0, z); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t1, z, &t1); + fe_mul_ttt(&t0, &t0, &t1); + fe_sq_tt(&t0, &t0); + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t1, &t1, &t0); + fe_sq_tt(&t2, 
&t1); + for (i = 1; i < 20; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t1, &t1, &t0); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 100; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t0, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t0, &t0); + } + fe_mul_ttt(out, &t0, z); +} + + +// Group operations. + +int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) { + fe u; + fe_loose v; + fe w; + fe vxx; + fe_loose check; + + fe_frombytes(&h->Y, s); + fe_1(&h->Z); + fe_sq_tt(&w, &h->Y); + fe_mul_ttt(&vxx, &w, &d); + fe_sub(&v, &w, &h->Z); // u = y^2-1 + fe_carry(&u, &v); + fe_add(&v, &vxx, &h->Z); // v = dy^2+1 + + fe_mul_ttl(&w, &u, &v); // w = u*v + fe_pow22523(&h->X, &w); // x = w^((q-5)/8) + fe_mul_ttt(&h->X, &h->X, &u); // x = u*w^((q-5)/8) + + fe_sq_tt(&vxx, &h->X); + fe_mul_ttl(&vxx, &vxx, &v); + fe_sub(&check, &vxx, &u); + if (fe_isnonzero(&check)) { + fe_add(&check, &vxx, &u); + if (fe_isnonzero(&check)) { + return 0; + } + fe_mul_ttt(&h->X, &h->X, &sqrtm1); + } + + if (fe_isnegative(&h->X) != (s[31] >> 7)) { + fe_loose t; + fe_neg(&t, &h->X); + fe_carry(&h->X, &t); + } + + fe_mul_ttt(&h->T, &h->X, &h->Y); + return 1; +} + +static void ge_p2_0(ge_p2 *h) { + fe_0(&h->X); + fe_1(&h->Y); + fe_1(&h->Z); +} + +static void ge_p3_0(ge_p3 *h) { + fe_0(&h->X); + fe_1(&h->Y); + fe_1(&h->Z); + fe_0(&h->T); +} + +#if defined(OPENSSL_SMALL) + +static void ge_precomp_0(ge_precomp *h) { + fe_loose_1(&h->yplusx); + fe_loose_1(&h->yminusx); + fe_loose_0(&h->xy2d); +} + +#endif + +// r = p +static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) { + fe_copy(&r->X, &p->X); + fe_copy(&r->Y, &p->Y); + fe_copy(&r->Z, &p->Z); +} + +// r = p +static void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) { + fe_add(&r->YplusX, &p->Y, &p->X); + fe_sub(&r->YminusX, &p->Y, &p->X); + fe_copy_lt(&r->Z, &p->Z); + fe_mul_ltt(&r->T2d, &p->T, &d2); +} + +// r = p +static void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) { + fe_mul_tll(&r->X, &p->X, &p->T); + fe_mul_tll(&r->Y, &p->Y, &p->Z); + fe_mul_tll(&r->Z, &p->Z, &p->T); +} + +// r = p +static void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) { + fe_mul_tll(&r->X, &p->X, &p->T); + fe_mul_tll(&r->Y, &p->Y, &p->Z); + fe_mul_tll(&r->Z, &p->Z, &p->T); + fe_mul_tll(&r->T, &p->X, &p->Y); +} + +// r = 2 * p +static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) { + fe trX, trZ, trT; + fe t0; + + fe_sq_tt(&trX, &p->X); + fe_sq_tt(&trZ, &p->Y); + fe_sq2_tt(&trT, &p->Z); + fe_add(&r->Y, &p->X, &p->Y); + fe_sq_tl(&t0, &r->Y); + + fe_add(&r->Y, &trZ, &trX); + fe_sub(&r->Z, &trZ, &trX); + fe_carry(&trZ, &r->Y); + fe_sub(&r->X, &t0, &trZ); + fe_carry(&trZ, &r->Z); + fe_sub(&r->T, &trT, &trZ); +} + +// r = 2 * p +static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) { + ge_p2 q; + ge_p3_to_p2(&q, p); + ge_p2_dbl(r, &q); +} + +// r = p + q +static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { + fe trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->yplusx); + fe_mul_tll(&trY, &r->Y, &q->yminusx); + fe_mul_tlt(&trT, &q->xy2d, &p->T); + fe_add(&r->T, &p->Z, &p->Z); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_add(&r->Z, 
&trZ, &trT); + fe_sub(&r->T, &trZ, &trT); +} + +// r = p - q +static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { + fe trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->yminusx); + fe_mul_tll(&trY, &r->Y, &q->yplusx); + fe_mul_tlt(&trT, &q->xy2d, &p->T); + fe_add(&r->T, &p->Z, &p->Z); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_sub(&r->Z, &trZ, &trT); + fe_add(&r->T, &trZ, &trT); +} + +// r = p + q +static void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { + fe trX, trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->YplusX); + fe_mul_tll(&trY, &r->Y, &q->YminusX); + fe_mul_tlt(&trT, &q->T2d, &p->T); + fe_mul_ttl(&trX, &p->Z, &q->Z); + fe_add(&r->T, &trX, &trX); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_add(&r->Z, &trZ, &trT); + fe_sub(&r->T, &trZ, &trT); +} + +// r = p - q +static void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { + fe trX, trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->YminusX); + fe_mul_tll(&trY, &r->Y, &q->YplusX); + fe_mul_tlt(&trT, &q->T2d, &p->T); + fe_mul_ttl(&trX, &p->Z, &q->Z); + fe_add(&r->T, &trX, &trX); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_sub(&r->Z, &trZ, &trT); + fe_add(&r->T, &trZ, &trT); +} + +static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) { + fe_cmov(&t->yplusx, &u->yplusx, b); + fe_cmov(&t->yminusx, &u->yminusx, b); + fe_cmov(&t->xy2d, &u->xy2d, b); +} + +#if defined(OPENSSL_SMALL) + +static void x25519_ge_scalarmult_small_precomp( + ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) { + // precomp_table is first expanded into matching |ge_precomp| + // elements. + ge_precomp multiples[15]; + + unsigned i; + for (i = 0; i < 15; i++) { + // The precomputed table is assumed to already clear the top bit, so + // |fe_frombytes_strict| may be used directly. + const uint8_t *bytes = &precomp_table[i*(2 * 32)]; + fe x, y; + fe_frombytes_strict(&x, bytes); + fe_frombytes_strict(&y, bytes + 32); + + ge_precomp *out = &multiples[i]; + fe_add(&out->yplusx, &y, &x); + fe_sub(&out->yminusx, &y, &x); + fe_mul_ltt(&out->xy2d, &x, &y); + fe_mul_llt(&out->xy2d, &out->xy2d, &d2); + } + + // See the comment above |k25519SmallPrecomp| about the structure of the + // precomputed elements. This loop does 64 additions and 64 doublings to + // calculate the result. 
+ ge_p3_0(h); + + for (i = 63; i < 64; i--) { + unsigned j; + signed char index = 0; + + for (j = 0; j < 4; j++) { + const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7)); + index |= (bit << j); + } + + ge_precomp e; + ge_precomp_0(&e); + + for (j = 1; j < 16; j++) { + cmov(&e, &multiples[j-1], 1&constant_time_eq_w(index, j)); + } + + ge_cached cached; + ge_p1p1 r; + x25519_ge_p3_to_cached(&cached, h); + x25519_ge_add(&r, h, &cached); + x25519_ge_p1p1_to_p3(h, &r); + + ge_madd(&r, h, &e); + x25519_ge_p1p1_to_p3(h, &r); + } +} + +void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32], int use_adx) { + (void)use_adx; + x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp); +} + +#else + +static void table_select(ge_precomp *t, const int pos, const signed char b) { + uint8_t bnegative = constant_time_msb_w(b); + uint8_t babs = b - ((bnegative & b) << 1); + + uint8_t t_bytes[3][32] = { + {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}}; +#if defined(__clang__) // materialize for vectorization, 6% speedup + __asm__("" : "+m" (t_bytes) : /*no inputs*/); +#endif + OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), ""); + for (int i = 0; i < 8; i++) { + constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i], + sizeof(t_bytes), + constant_time_eq_w(babs, 1 + i)); + } + + fe yplusx, yminusx, xy2d; + fe_frombytes_strict(&yplusx, t_bytes[0]); + fe_frombytes_strict(&yminusx, t_bytes[1]); + fe_frombytes_strict(&xy2d, t_bytes[2]); + + fe_copy_lt(&t->yplusx, &yplusx); + fe_copy_lt(&t->yminusx, &yminusx); + fe_copy_lt(&t->xy2d, &xy2d); + + ge_precomp minust; + fe_copy_lt(&minust.yplusx, &yminusx); + fe_copy_lt(&minust.yminusx, &yplusx); + fe_neg(&minust.xy2d, &xy2d); + cmov(t, &minust, bnegative>>7); +} + +// h = a * B +// where a = a[0]+256*a[1]+...+256^31 a[31] +// B is the Ed25519 base point (x,4/5) with x positive. 
+// +// Preconditions: +// a[31] <= 127 +void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32], int use_adx) { +#if defined(BORINGSSL_FE25519_ADX) + if (use_adx) { + uint8_t t[4][32]; + x25519_ge_scalarmult_base_adx(t, a); + fiat_25519_from_bytes(h->X.v, t[0]); + fiat_25519_from_bytes(h->Y.v, t[1]); + fiat_25519_from_bytes(h->Z.v, t[2]); + fiat_25519_from_bytes(h->T.v, t[3]); + return; + } +#else + (void)use_adx; +#endif + signed char e[64]; + signed char carry; + ge_p1p1 r; + ge_p2 s; + ge_precomp t; + int i; + + for (i = 0; i < 32; ++i) { + e[2 * i + 0] = (a[i] >> 0) & 15; + e[2 * i + 1] = (a[i] >> 4) & 15; + } + // each e[i] is between 0 and 15 + // e[63] is between 0 and 7 + + carry = 0; + for (i = 0; i < 63; ++i) { + e[i] += carry; + carry = e[i] + 8; + carry >>= 4; + e[i] -= carry << 4; + } + e[63] += carry; + // each e[i] is between -8 and 8 + + ge_p3_0(h); + for (i = 1; i < 64; i += 2) { + table_select(&t, i / 2, e[i]); + ge_madd(&r, h, &t); + x25519_ge_p1p1_to_p3(h, &r); + } + + ge_p3_dbl(&r, h); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p3(h, &r); + + for (i = 0; i < 64; i += 2) { + table_select(&t, i / 2, e[i]); + ge_madd(&r, h, &t); + x25519_ge_p1p1_to_p3(h, &r); + } +} + +#endif + +static void slide(signed char *r, const uint8_t *a) { + int i; + int b; + int k; + + for (i = 0; i < 256; ++i) { + r[i] = 1 & (a[i >> 3] >> (i & 7)); + } + + for (i = 0; i < 256; ++i) { + if (r[i]) { + for (b = 1; b <= 6 && i + b < 256; ++b) { + if (r[i + b]) { + if (r[i] + (r[i + b] << b) <= 15) { + r[i] += r[i + b] << b; + r[i + b] = 0; + } else if (r[i] - (r[i + b] << b) >= -15) { + r[i] -= r[i + b] << b; + for (k = i + b; k < 256; ++k) { + if (!r[k]) { + r[k] = 1; + break; + } + r[k] = 0; + } + } else { + break; + } + } + } + } + } +} + +// r = a * A + b * B +// where a = a[0]+256*a[1]+...+256^31 a[31]. +// and b = b[0]+256*b[1]+...+256^31 b[31]. +// B is the Ed25519 base point (x,4/5) with x positive. 
+static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, + const ge_p3 *A, const uint8_t *b) { + signed char aslide[256]; + signed char bslide[256]; + ge_cached Ai[8]; // A,3A,5A,7A,9A,11A,13A,15A + ge_p1p1 t; + ge_p3 u; + ge_p3 A2; + int i; + + slide(aslide, a); + slide(bslide, b); + + x25519_ge_p3_to_cached(&Ai[0], A); + ge_p3_dbl(&t, A); + x25519_ge_p1p1_to_p3(&A2, &t); + x25519_ge_add(&t, &A2, &Ai[0]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[1], &u); + x25519_ge_add(&t, &A2, &Ai[1]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[2], &u); + x25519_ge_add(&t, &A2, &Ai[2]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[3], &u); + x25519_ge_add(&t, &A2, &Ai[3]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[4], &u); + x25519_ge_add(&t, &A2, &Ai[4]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[5], &u); + x25519_ge_add(&t, &A2, &Ai[5]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[6], &u); + x25519_ge_add(&t, &A2, &Ai[6]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[7], &u); + + ge_p2_0(r); + + for (i = 255; i >= 0; --i) { + if (aslide[i] || bslide[i]) { + break; + } + } + + for (; i >= 0; --i) { + ge_p2_dbl(&t, r); + + if (aslide[i] > 0) { + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]); + } else if (aslide[i] < 0) { + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]); + } + + if (bslide[i] > 0) { + x25519_ge_p1p1_to_p3(&u, &t); + ge_madd(&t, &u, &Bi[bslide[i] / 2]); + } else if (bslide[i] < 0) { + x25519_ge_p1p1_to_p3(&u, &t); + ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]); + } + + x25519_ge_p1p1_to_p2(r, &t); + } +} + +// int64_lshift21 returns |a << 21| but is defined when shifting bits into the +// sign bit. This works around a language flaw in C. +static inline int64_t int64_lshift21(int64_t a) { + return (int64_t)((uint64_t)a << 21); +} + +// The set of scalars is \Z/l +// where l = 2^252 + 27742317777372353535851937790883648493. + +// Input: +// s[0]+256*s[1]+...+256^63*s[63] = s +// +// Output: +// s[0]+256*s[1]+...+256^31*s[31] = s mod l +// where l = 2^252 + 27742317777372353535851937790883648493. +// Overwrites s in place. 
+void x25519_sc_reduce(uint8_t s[64]) { + int64_t s0 = 2097151 & load_3(s); + int64_t s1 = 2097151 & (load_4(s + 2) >> 5); + int64_t s2 = 2097151 & (load_3(s + 5) >> 2); + int64_t s3 = 2097151 & (load_4(s + 7) >> 7); + int64_t s4 = 2097151 & (load_4(s + 10) >> 4); + int64_t s5 = 2097151 & (load_3(s + 13) >> 1); + int64_t s6 = 2097151 & (load_4(s + 15) >> 6); + int64_t s7 = 2097151 & (load_3(s + 18) >> 3); + int64_t s8 = 2097151 & load_3(s + 21); + int64_t s9 = 2097151 & (load_4(s + 23) >> 5); + int64_t s10 = 2097151 & (load_3(s + 26) >> 2); + int64_t s11 = 2097151 & (load_4(s + 28) >> 7); + int64_t s12 = 2097151 & (load_4(s + 31) >> 4); + int64_t s13 = 2097151 & (load_3(s + 34) >> 1); + int64_t s14 = 2097151 & (load_4(s + 36) >> 6); + int64_t s15 = 2097151 & (load_3(s + 39) >> 3); + int64_t s16 = 2097151 & load_3(s + 42); + int64_t s17 = 2097151 & (load_4(s + 44) >> 5); + int64_t s18 = 2097151 & (load_3(s + 47) >> 2); + int64_t s19 = 2097151 & (load_4(s + 49) >> 7); + int64_t s20 = 2097151 & (load_4(s + 52) >> 4); + int64_t s21 = 2097151 & (load_3(s + 55) >> 1); + int64_t s22 = 2097151 & (load_4(s + 57) >> 6); + int64_t s23 = (load_4(s + 60) >> 3); + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + int64_t carry10; + int64_t carry11; + int64_t carry12; + int64_t carry13; + int64_t carry14; + int64_t carry15; + int64_t carry16; + + s11 += s23 * 666643; + s12 += s23 * 470296; + s13 += s23 * 654183; + s14 -= s23 * 997805; + s15 += s23 * 136657; + s16 -= s23 * 683901; + s23 = 0; + + s10 += s22 * 666643; + s11 += s22 * 470296; + s12 += s22 * 654183; + s13 -= s22 * 997805; + s14 += s22 * 136657; + s15 -= s22 * 683901; + s22 = 0; + + s9 += s21 * 666643; + s10 += s21 * 470296; + s11 += s21 * 654183; + s12 -= s21 * 997805; + s13 += s21 * 136657; + s14 -= s21 * 683901; + s21 = 0; + + s8 += s20 * 666643; + s9 += s20 * 470296; + s10 += s20 * 654183; + s11 -= s20 * 997805; + s12 += s20 * 136657; + s13 -= s20 * 683901; + s20 = 0; + + s7 += s19 * 666643; + s8 += s19 * 470296; + s9 += s19 * 654183; + s10 -= s19 * 997805; + s11 += s19 * 136657; + s12 -= s19 * 683901; + s19 = 0; + + s6 += s18 * 666643; + s7 += s18 * 470296; + s8 += s18 * 654183; + s9 -= s18 * 997805; + s10 += s18 * 136657; + s11 -= s18 * 683901; + s18 = 0; + + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + + s5 += s17 * 666643; + s6 += s17 * 470296; + s7 += s17 * 654183; + s8 -= s17 * 997805; + s9 += s17 * 136657; + s10 -= s17 * 683901; + s17 = 0; + + s4 += s16 * 666643; + s5 += s16 * 470296; + s6 += s16 * 654183; + s7 
-= s16 * 997805; + s8 += s16 * 136657; + s9 -= s16 * 683901; + s16 = 0; + + s3 += s15 * 666643; + s4 += s15 * 470296; + s5 += s15 * 654183; + s6 -= s15 * 997805; + s7 += s15 * 136657; + s8 -= s15 * 683901; + s15 = 0; + + s2 += s14 * 666643; + s3 += s14 * 470296; + s4 += s14 * 654183; + s5 -= s14 * 997805; + s6 += s14 * 136657; + s7 -= s14 * 683901; + s14 = 0; + + s1 += s13 * 666643; + s2 += s13 * 470296; + s3 += s13 * 654183; + s4 -= s13 * 997805; + s5 += s13 * 136657; + s6 -= s13 * 683901; + s13 = 0; + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry11 = s11 >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= 
int64_lshift21(carry10); + + s[0] = s0 >> 0; + s[1] = s0 >> 8; + s[2] = (s0 >> 16) | (s1 << 5); + s[3] = s1 >> 3; + s[4] = s1 >> 11; + s[5] = (s1 >> 19) | (s2 << 2); + s[6] = s2 >> 6; + s[7] = (s2 >> 14) | (s3 << 7); + s[8] = s3 >> 1; + s[9] = s3 >> 9; + s[10] = (s3 >> 17) | (s4 << 4); + s[11] = s4 >> 4; + s[12] = s4 >> 12; + s[13] = (s4 >> 20) | (s5 << 1); + s[14] = s5 >> 7; + s[15] = (s5 >> 15) | (s6 << 6); + s[16] = s6 >> 2; + s[17] = s6 >> 10; + s[18] = (s6 >> 18) | (s7 << 3); + s[19] = s7 >> 5; + s[20] = s7 >> 13; + s[21] = s8 >> 0; + s[22] = s8 >> 8; + s[23] = (s8 >> 16) | (s9 << 5); + s[24] = s9 >> 3; + s[25] = s9 >> 11; + s[26] = (s9 >> 19) | (s10 << 2); + s[27] = s10 >> 6; + s[28] = (s10 >> 14) | (s11 << 7); + s[29] = s11 >> 1; + s[30] = s11 >> 9; + s[31] = s11 >> 17; +} + +// Input: +// a[0]+256*a[1]+...+256^31*a[31] = a +// b[0]+256*b[1]+...+256^31*b[31] = b +// c[0]+256*c[1]+...+256^31*c[31] = c +// +// Output: +// s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l +// where l = 2^252 + 27742317777372353535851937790883648493. +static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, + const uint8_t *c) { + int64_t a0 = 2097151 & load_3(a); + int64_t a1 = 2097151 & (load_4(a + 2) >> 5); + int64_t a2 = 2097151 & (load_3(a + 5) >> 2); + int64_t a3 = 2097151 & (load_4(a + 7) >> 7); + int64_t a4 = 2097151 & (load_4(a + 10) >> 4); + int64_t a5 = 2097151 & (load_3(a + 13) >> 1); + int64_t a6 = 2097151 & (load_4(a + 15) >> 6); + int64_t a7 = 2097151 & (load_3(a + 18) >> 3); + int64_t a8 = 2097151 & load_3(a + 21); + int64_t a9 = 2097151 & (load_4(a + 23) >> 5); + int64_t a10 = 2097151 & (load_3(a + 26) >> 2); + int64_t a11 = (load_4(a + 28) >> 7); + int64_t b0 = 2097151 & load_3(b); + int64_t b1 = 2097151 & (load_4(b + 2) >> 5); + int64_t b2 = 2097151 & (load_3(b + 5) >> 2); + int64_t b3 = 2097151 & (load_4(b + 7) >> 7); + int64_t b4 = 2097151 & (load_4(b + 10) >> 4); + int64_t b5 = 2097151 & (load_3(b + 13) >> 1); + int64_t b6 = 2097151 & (load_4(b + 15) >> 6); + int64_t b7 = 2097151 & (load_3(b + 18) >> 3); + int64_t b8 = 2097151 & load_3(b + 21); + int64_t b9 = 2097151 & (load_4(b + 23) >> 5); + int64_t b10 = 2097151 & (load_3(b + 26) >> 2); + int64_t b11 = (load_4(b + 28) >> 7); + int64_t c0 = 2097151 & load_3(c); + int64_t c1 = 2097151 & (load_4(c + 2) >> 5); + int64_t c2 = 2097151 & (load_3(c + 5) >> 2); + int64_t c3 = 2097151 & (load_4(c + 7) >> 7); + int64_t c4 = 2097151 & (load_4(c + 10) >> 4); + int64_t c5 = 2097151 & (load_3(c + 13) >> 1); + int64_t c6 = 2097151 & (load_4(c + 15) >> 6); + int64_t c7 = 2097151 & (load_3(c + 18) >> 3); + int64_t c8 = 2097151 & load_3(c + 21); + int64_t c9 = 2097151 & (load_4(c + 23) >> 5); + int64_t c10 = 2097151 & (load_3(c + 26) >> 2); + int64_t c11 = (load_4(c + 28) >> 7); + int64_t s0; + int64_t s1; + int64_t s2; + int64_t s3; + int64_t s4; + int64_t s5; + int64_t s6; + int64_t s7; + int64_t s8; + int64_t s9; + int64_t s10; + int64_t s11; + int64_t s12; + int64_t s13; + int64_t s14; + int64_t s15; + int64_t s16; + int64_t s17; + int64_t s18; + int64_t s19; + int64_t s20; + int64_t s21; + int64_t s22; + int64_t s23; + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + int64_t carry10; + int64_t carry11; + int64_t carry12; + int64_t carry13; + int64_t carry14; + int64_t carry15; + int64_t carry16; + int64_t carry17; + int64_t carry18; + int64_t carry19; + int64_t carry20; + int64_t carry21; + int64_t 
carry22; + + s0 = c0 + a0 * b0; + s1 = c1 + a0 * b1 + a1 * b0; + s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0; + s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; + s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; + s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0; + s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0; + s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 + + a6 * b1 + a7 * b0; + s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 + + a6 * b2 + a7 * b1 + a8 * b0; + s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 + + a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0; + s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 + + a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0; + s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 + + a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0; + s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 + + a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1; + s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 + + a9 * b4 + a10 * b3 + a11 * b2; + s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 + + a10 * b4 + a11 * b3; + s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 + + a11 * b4; + s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5; + s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6; + s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7; + s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8; + s20 = a9 * b11 + a10 * b10 + a11 * b9; + s21 = a10 * b11 + a11 * b10; + s22 = a11 * b11; + s23 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + carry18 = (s18 + (1 << 20)) >> 21; + s19 += carry18; + s18 -= int64_lshift21(carry18); + carry20 = (s20 + (1 << 20)) >> 21; + s21 += carry20; + s20 -= int64_lshift21(carry20); + carry22 = (s22 + (1 << 20)) >> 21; + s23 += carry22; + s22 -= int64_lshift21(carry22); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + carry17 = (s17 + (1 << 20)) >> 21; + s18 += carry17; + s17 -= int64_lshift21(carry17); 
+ carry19 = (s19 + (1 << 20)) >> 21; + s20 += carry19; + s19 -= int64_lshift21(carry19); + carry21 = (s21 + (1 << 20)) >> 21; + s22 += carry21; + s21 -= int64_lshift21(carry21); + + s11 += s23 * 666643; + s12 += s23 * 470296; + s13 += s23 * 654183; + s14 -= s23 * 997805; + s15 += s23 * 136657; + s16 -= s23 * 683901; + s23 = 0; + + s10 += s22 * 666643; + s11 += s22 * 470296; + s12 += s22 * 654183; + s13 -= s22 * 997805; + s14 += s22 * 136657; + s15 -= s22 * 683901; + s22 = 0; + + s9 += s21 * 666643; + s10 += s21 * 470296; + s11 += s21 * 654183; + s12 -= s21 * 997805; + s13 += s21 * 136657; + s14 -= s21 * 683901; + s21 = 0; + + s8 += s20 * 666643; + s9 += s20 * 470296; + s10 += s20 * 654183; + s11 -= s20 * 997805; + s12 += s20 * 136657; + s13 -= s20 * 683901; + s20 = 0; + + s7 += s19 * 666643; + s8 += s19 * 470296; + s9 += s19 * 654183; + s10 -= s19 * 997805; + s11 += s19 * 136657; + s12 -= s19 * 683901; + s19 = 0; + + s6 += s18 * 666643; + s7 += s18 * 470296; + s8 += s18 * 654183; + s9 -= s18 * 997805; + s10 += s18 * 136657; + s11 -= s18 * 683901; + s18 = 0; + + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + + s5 += s17 * 666643; + s6 += s17 * 470296; + s7 += s17 * 654183; + s8 -= s17 * 997805; + s9 += s17 * 136657; + s10 -= s17 * 683901; + s17 = 0; + + s4 += s16 * 666643; + s5 += s16 * 470296; + s6 += s16 * 654183; + s7 -= s16 * 997805; + s8 += s16 * 136657; + s9 -= s16 * 683901; + s16 = 0; + + s3 += s15 * 666643; + s4 += s15 * 470296; + s5 += s15 * 654183; + s6 -= s15 * 997805; + s7 += s15 * 136657; + s8 -= s15 * 683901; + s15 = 0; + + s2 += s14 * 666643; + s3 += s14 * 470296; + s4 += s14 * 654183; + s5 -= s14 * 997805; + s6 += s14 * 136657; + s7 -= s14 * 683901; + s14 = 0; + + s1 += s13 * 666643; + s2 += s13 * 470296; + s3 += s13 * 654183; + s4 -= s13 * 997805; + s5 += s13 * 136657; + s6 -= s13 * 683901; + s13 = 0; + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= 
int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry11 = s11 >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + s[0] = s0 >> 0; + s[1] = s0 >> 8; + s[2] = (s0 >> 16) | (s1 << 5); + s[3] = s1 >> 3; + s[4] = s1 >> 11; + s[5] = (s1 >> 19) | (s2 << 2); + s[6] = s2 >> 6; + s[7] = (s2 >> 14) | (s3 << 7); + s[8] = s3 >> 1; + s[9] = s3 >> 9; + s[10] = (s3 >> 17) | (s4 << 4); + s[11] = s4 >> 4; + s[12] = s4 >> 12; + s[13] = (s4 >> 20) | (s5 << 1); + s[14] = s5 >> 7; + s[15] = (s5 >> 15) | (s6 << 6); + s[16] = s6 >> 2; + s[17] = s6 >> 10; + s[18] = (s6 >> 18) | (s7 << 3); + s[19] = s7 >> 5; + s[20] = s7 >> 13; + s[21] = s8 >> 0; + s[22] = s8 >> 8; + s[23] = (s8 >> 16) | (s9 << 5); + s[24] = s9 >> 3; + s[25] = s9 >> 11; + s[26] = (s9 >> 19) | (s10 << 2); + s[27] = s10 >> 6; + s[28] = (s10 >> 14) | (s11 << 7); + s[29] = s11 >> 1; + s[30] = s11 >> 9; + s[31] = s11 >> 17; +} + + +void x25519_scalar_mult_generic_masked(uint8_t out[32], + const uint8_t scalar_masked[32], + const uint8_t point[32]) { + fe x1, x2, z2, x3, z3, tmp0, tmp1; + fe_loose x2l, z2l, x3l, tmp0l, tmp1l; + + uint8_t e[32]; + OPENSSL_memcpy(e, scalar_masked, 32); + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. 
It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. + // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe_frombytes(&x1, point); + fe_1(&x2); + fe_0(&z2); + fe_copy(&x3, &x1); + fe_1(&z3); + + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // x1 = 0 + fe_sub(&tmp0l, &x3, &z3); + fe_sub(&tmp1l, &x2, &z2); + fe_add(&x2l, &x2, &z2); + fe_add(&z2l, &x3, &z3); + fe_mul_tll(&z3, &tmp0l, &x2l); + fe_mul_tll(&z2, &z2l, &tmp1l); + fe_sq_tl(&tmp0, &tmp1l); + fe_sq_tl(&tmp1, &x2l); + fe_add(&x3l, &z3, &z2); + fe_sub(&z2l, &z3, &z2); + fe_mul_ttt(&x2, &tmp1, &tmp0); + fe_sub(&tmp1l, &tmp1, &tmp0); + fe_sq_tl(&z2, &z2l); + fe_mul121666(&z3, &tmp1l); + fe_sq_tl(&x3, &x3l); + fe_add(&tmp0l, &tmp0, &z3); + fe_mul_ttt(&z3, &x1, &z2); + fe_mul_tll(&z2, &tmp1l, &tmp0l); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + + fe_invert(&z2, &z2); + fe_mul_ttt(&x2, &x2, &z2); + fe_tobytes(out, &x2); +} + +void x25519_public_from_private_generic_masked(uint8_t out_public_value[32], + const uint8_t private_key_masked[32], + int use_adx) { + uint8_t e[32]; + OPENSSL_memcpy(e, private_key_masked, 32); + + ge_p3 A; + x25519_ge_scalarmult_base(&A, e, use_adx); + + // We only need the u-coordinate of the curve25519 point. The map is + // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y). 
+ fe_loose zplusy, zminusy; + fe zminusy_inv; + fe_add(&zplusy, &A.Z, &A.Y); + fe_sub(&zminusy, &A.Z, &A.Y); + fe_loose_invert(&zminusy_inv, &zminusy); + fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv); + fe_tobytes(out_public_value, &zminusy_inv); + CONSTTIME_DECLASSIFY(out_public_value, 32); +} + +void x25519_fe_invert(fe *out, const fe *z) { + fe_invert(out, z); +} + +uint8_t x25519_fe_isnegative(const fe *f) { + return (uint8_t)fe_isnegative(f); +} + +void x25519_fe_mul_ttt(fe *h, const fe *f, const fe *g) { + fe_mul_ttt(h, f, g); +} + +void x25519_fe_neg(fe *f) { + fe_loose t; + fe_neg(&t, f); + fe_carry(f, &t); +} + +void x25519_fe_tobytes(uint8_t s[32], const fe *h) { + fe_tobytes(s, h); +} + +void x25519_ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, + const ge_p3 *A, const uint8_t *b) { + ge_double_scalarmult_vartime(r, a, A, b); +} + +void x25519_sc_mask(uint8_t a[32]) { + a[0] &= 248; + a[31] &= 127; + a[31] |= 64; +} + +void x25519_sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, + const uint8_t *c) { + sc_muladd(s, a, b, c); +} diff --git a/ring-0.17.14/crypto/curve25519/curve25519_64_adx.c b/ring-0.17.14/crypto/curve25519/curve25519_64_adx.c new file mode 100644 index 0000000000..88964a9ddc --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/curve25519_64_adx.c @@ -0,0 +1,23 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal.h" +#if defined(BORINGSSL_FE25519_ADX) + +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wpedantic" +#pragma GCC diagnostic ignored "-Wsign-conversion" + +#include "../../third_party/fiat/curve25519_64_adx.h" +#endif diff --git a/ring-0.17.14/crypto/curve25519/curve25519_tables.h b/ring-0.17.14/crypto/curve25519/curve25519_tables.h new file mode 100644 index 0000000000..72caa33783 --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/curve25519_tables.h @@ -0,0 +1,3264 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// This file is generated from +// ./make_curve25519_tables.py > curve25519_tables.h + + +static const fe d = {{ +#if defined(OPENSSL_64_BIT) + 929955233495203, 466365720129213, 1662059464998953, 2033849074728123, + 1442794654840575 +#else + 56195235, 13857412, 51736253, 6949390, 114729, 24766616, 60832955, 30306712, + 48412415, 21499315 +#endif +}}; + +static const fe sqrtm1 = {{ +#if defined(OPENSSL_64_BIT) + 1718705420411056, 234908883556509, 2233514472574048, 2117202627021982, + 765476049583133 +#else + 34513072, 25610706, 9377949, 3500415, 12389472, 33281959, 41962654, + 31548777, 326685, 11406482 +#endif +}}; + +static const fe d2 = {{ +#if defined(OPENSSL_64_BIT) + 1859910466990425, 932731440258426, 1072319116312658, 1815898335770999, + 633789495995903 +#else + 45281625, 27714825, 36363642, 13898781, 229458, 15978800, 54557047, + 27058993, 29715967, 9444199 +#endif +}}; + +#if defined(OPENSSL_SMALL) + +// This block of code replaces the standard base-point table with a much smaller +// one. The standard table is 30,720 bytes while this one is just 960. +// +// This table contains 15 pairs of group elements, (x, y), where each field +// element is serialised with |fe_tobytes|. If |i| is the index of the group +// element then consider i+1 as a four-bit number: (i₀, i₁, i₂, i₃) (where i₀ +// is the most significant bit). The value of the group element is then: +// (i₀×2^192 + i₁×2^128 + i₂×2^64 + i₃)G, where G is the generator. +static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = { + 0x1a, 0xd5, 0x25, 0x8f, 0x60, 0x2d, 0x56, 0xc9, 0xb2, 0xa7, 0x25, 0x95, + 0x60, 0xc7, 0x2c, 0x69, 0x5c, 0xdc, 0xd6, 0xfd, 0x31, 0xe2, 0xa4, 0xc0, + 0xfe, 0x53, 0x6e, 0xcd, 0xd3, 0x36, 0x69, 0x21, 0x58, 0x66, 0x66, 0x66, + 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, + 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, + 0x66, 0x66, 0x66, 0x66, 0x02, 0xa2, 0xed, 0xf4, 0x8f, 0x6b, 0x0b, 0x3e, + 0xeb, 0x35, 0x1a, 0xd5, 0x7e, 0xdb, 0x78, 0x00, 0x96, 0x8a, 0xa0, 0xb4, + 0xcf, 0x60, 0x4b, 0xd4, 0xd5, 0xf9, 0x2d, 0xbf, 0x88, 0xbd, 0x22, 0x62, + 0x13, 0x53, 0xe4, 0x82, 0x57, 0xfa, 0x1e, 0x8f, 0x06, 0x2b, 0x90, 0xba, + 0x08, 0xb6, 0x10, 0x54, 0x4f, 0x7c, 0x1b, 0x26, 0xed, 0xda, 0x6b, 0xdd, + 0x25, 0xd0, 0x4e, 0xea, 0x42, 0xbb, 0x25, 0x03, 0xa2, 0xfb, 0xcc, 0x61, + 0x67, 0x06, 0x70, 0x1a, 0xc4, 0x78, 0x3a, 0xff, 0x32, 0x62, 0xdd, 0x2c, + 0xab, 0x50, 0x19, 0x3b, 0xf2, 0x9b, 0x7d, 0xb8, 0xfd, 0x4f, 0x29, 0x9c, + 0xa7, 0x91, 0xba, 0x0e, 0x46, 0x5e, 0x51, 0xfe, 0x1d, 0xbf, 0xe5, 0xe5, + 0x9b, 0x95, 0x0d, 0x67, 0xf8, 0xd1, 0xb5, 0x5a, 0xa1, 0x93, 0x2c, 0xc3, + 0xde, 0x0e, 0x97, 0x85, 0x2d, 0x7f, 0xea, 0xab, 0x3e, 0x47, 0x30, 0x18, + 0x24, 0xe8, 0xb7, 0x60, 0xae, 0x47, 0x80, 0xfc, 0xe5, 0x23, 0xe7, 0xc2, + 0xc9, 0x85, 0xe6, 0x98, 0xa0, 0x29, 0x4e, 0xe1, 0x84, 0x39, 0x2d, 0x95, + 0x2c, 0xf3, 0x45, 0x3c, 0xff, 0xaf, 0x27, 0x4c, 0x6b, 0xa6, 0xf5, 0x4b, + 0x11, 0xbd, 0xba, 0x5b, 0x9e, 0xc4, 0xa4, 0x51, 0x1e, 0xbe, 0xd0, 0x90, + 0x3a, 0x9c, 0xc2, 0x26, 0xb6, 0x1e, 0xf1, 0x95, 0x7d, 0xc8, 0x6d, 0x52, + 0xe6, 0x99, 0x2c, 0x5f, 0x9a, 0x96, 0x0c, 0x68, 0x29, 0xfd, 0xe2, 0xfb, + 0xe6, 0xbc, 0xec, 0x31, 0x08, 0xec, 0xe6, 0xb0, 0x53, 0x60, 0xc3, 0x8c, + 0xbe, 0xc1, 0xb3, 0x8a, 0x8f, 0xe4, 0x88, 0x2b, 0x55, 0xe5, 0x64, 0x6e, + 0x9b, 0xd0, 0xaf, 0x7b, 0x64, 0x2a, 0x35, 0x25, 0x10, 0x52, 0xc5, 0x9e, + 0x58, 0x11, 0x39, 0x36, 0x45, 0x51, 0xb8, 0x39, 0x93, 0xfc, 0x9d, 0x6a, + 0xbe, 0x58, 0xcb, 0xa4, 0x0f, 0x51, 0x3c, 0x38, 0x05, 0xca, 0xab, 0x43, + 0x63, 0x0e, 0xf3, 0x8b, 0x41, 0xa6, 
0xf8, 0x9b, 0x53, 0x70, 0x80, 0x53, + 0x86, 0x5e, 0x8f, 0xe3, 0xc3, 0x0d, 0x18, 0xc8, 0x4b, 0x34, 0x1f, 0xd8, + 0x1d, 0xbc, 0xf2, 0x6d, 0x34, 0x3a, 0xbe, 0xdf, 0xd9, 0xf6, 0xf3, 0x89, + 0xa1, 0xe1, 0x94, 0x9f, 0x5d, 0x4c, 0x5d, 0xe9, 0xa1, 0x49, 0x92, 0xef, + 0x0e, 0x53, 0x81, 0x89, 0x58, 0x87, 0xa6, 0x37, 0xf1, 0xdd, 0x62, 0x60, + 0x63, 0x5a, 0x9d, 0x1b, 0x8c, 0xc6, 0x7d, 0x52, 0xea, 0x70, 0x09, 0x6a, + 0xe1, 0x32, 0xf3, 0x73, 0x21, 0x1f, 0x07, 0x7b, 0x7c, 0x9b, 0x49, 0xd8, + 0xc0, 0xf3, 0x25, 0x72, 0x6f, 0x9d, 0xed, 0x31, 0x67, 0x36, 0x36, 0x54, + 0x40, 0x92, 0x71, 0xe6, 0x11, 0x28, 0x11, 0xad, 0x93, 0x32, 0x85, 0x7b, + 0x3e, 0xb7, 0x3b, 0x49, 0x13, 0x1c, 0x07, 0xb0, 0x2e, 0x93, 0xaa, 0xfd, + 0xfd, 0x28, 0x47, 0x3d, 0x8d, 0xd2, 0xda, 0xc7, 0x44, 0xd6, 0x7a, 0xdb, + 0x26, 0x7d, 0x1d, 0xb8, 0xe1, 0xde, 0x9d, 0x7a, 0x7d, 0x17, 0x7e, 0x1c, + 0x37, 0x04, 0x8d, 0x2d, 0x7c, 0x5e, 0x18, 0x38, 0x1e, 0xaf, 0xc7, 0x1b, + 0x33, 0x48, 0x31, 0x00, 0x59, 0xf6, 0xf2, 0xca, 0x0f, 0x27, 0x1b, 0x63, + 0x12, 0x7e, 0x02, 0x1d, 0x49, 0xc0, 0x5d, 0x79, 0x87, 0xef, 0x5e, 0x7a, + 0x2f, 0x1f, 0x66, 0x55, 0xd8, 0x09, 0xd9, 0x61, 0x38, 0x68, 0xb0, 0x07, + 0xa3, 0xfc, 0xcc, 0x85, 0x10, 0x7f, 0x4c, 0x65, 0x65, 0xb3, 0xfa, 0xfa, + 0xa5, 0x53, 0x6f, 0xdb, 0x74, 0x4c, 0x56, 0x46, 0x03, 0xe2, 0xd5, 0x7a, + 0x29, 0x1c, 0xc6, 0x02, 0xbc, 0x59, 0xf2, 0x04, 0x75, 0x63, 0xc0, 0x84, + 0x2f, 0x60, 0x1c, 0x67, 0x76, 0xfd, 0x63, 0x86, 0xf3, 0xfa, 0xbf, 0xdc, + 0xd2, 0x2d, 0x90, 0x91, 0xbd, 0x33, 0xa9, 0xe5, 0x66, 0x0c, 0xda, 0x42, + 0x27, 0xca, 0xf4, 0x66, 0xc2, 0xec, 0x92, 0x14, 0x57, 0x06, 0x63, 0xd0, + 0x4d, 0x15, 0x06, 0xeb, 0x69, 0x58, 0x4f, 0x77, 0xc5, 0x8b, 0xc7, 0xf0, + 0x8e, 0xed, 0x64, 0xa0, 0xb3, 0x3c, 0x66, 0x71, 0xc6, 0x2d, 0xda, 0x0a, + 0x0d, 0xfe, 0x70, 0x27, 0x64, 0xf8, 0x27, 0xfa, 0xf6, 0x5f, 0x30, 0xa5, + 0x0d, 0x6c, 0xda, 0xf2, 0x62, 0x5e, 0x78, 0x47, 0xd3, 0x66, 0x00, 0x1c, + 0xfd, 0x56, 0x1f, 0x5d, 0x3f, 0x6f, 0xf4, 0x4c, 0xd8, 0xfd, 0x0e, 0x27, + 0xc9, 0x5c, 0x2b, 0xbc, 0xc0, 0xa4, 0xe7, 0x23, 0x29, 0x02, 0x9f, 0x31, + 0xd6, 0xe9, 0xd7, 0x96, 0xf4, 0xe0, 0x5e, 0x0b, 0x0e, 0x13, 0xee, 0x3c, + 0x09, 0xed, 0xf2, 0x3d, 0x76, 0x91, 0xc3, 0xa4, 0x97, 0xae, 0xd4, 0x87, + 0xd0, 0x5d, 0xf6, 0x18, 0x47, 0x1f, 0x1d, 0x67, 0xf2, 0xcf, 0x63, 0xa0, + 0x91, 0x27, 0xf8, 0x93, 0x45, 0x75, 0x23, 0x3f, 0xd1, 0xf1, 0xad, 0x23, + 0xdd, 0x64, 0x93, 0x96, 0x41, 0x70, 0x7f, 0xf7, 0xf5, 0xa9, 0x89, 0xa2, + 0x34, 0xb0, 0x8d, 0x1b, 0xae, 0x19, 0x15, 0x49, 0x58, 0x23, 0x6d, 0x87, + 0x15, 0x4f, 0x81, 0x76, 0xfb, 0x23, 0xb5, 0xea, 0xcf, 0xac, 0x54, 0x8d, + 0x4e, 0x42, 0x2f, 0xeb, 0x0f, 0x63, 0xdb, 0x68, 0x37, 0xa8, 0xcf, 0x8b, + 0xab, 0xf5, 0xa4, 0x6e, 0x96, 0x2a, 0xb2, 0xd6, 0xbe, 0x9e, 0xbd, 0x0d, + 0xb4, 0x42, 0xa9, 0xcf, 0x01, 0x83, 0x8a, 0x17, 0x47, 0x76, 0xc4, 0xc6, + 0x83, 0x04, 0x95, 0x0b, 0xfc, 0x11, 0xc9, 0x62, 0xb8, 0x0c, 0x76, 0x84, + 0xd9, 0xb9, 0x37, 0xfa, 0xfc, 0x7c, 0xc2, 0x6d, 0x58, 0x3e, 0xb3, 0x04, + 0xbb, 0x8c, 0x8f, 0x48, 0xbc, 0x91, 0x27, 0xcc, 0xf9, 0xb7, 0x22, 0x19, + 0x83, 0x2e, 0x09, 0xb5, 0x72, 0xd9, 0x54, 0x1c, 0x4d, 0xa1, 0xea, 0x0b, + 0xf1, 0xc6, 0x08, 0x72, 0x46, 0x87, 0x7a, 0x6e, 0x80, 0x56, 0x0a, 0x8a, + 0xc0, 0xdd, 0x11, 0x6b, 0xd6, 0xdd, 0x47, 0xdf, 0x10, 0xd9, 0xd8, 0xea, + 0x7c, 0xb0, 0x8f, 0x03, 0x00, 0x2e, 0xc1, 0x8f, 0x44, 0xa8, 0xd3, 0x30, + 0x06, 0x89, 0xa2, 0xf9, 0x34, 0xad, 0xdc, 0x03, 0x85, 0xed, 0x51, 0xa7, + 0x82, 0x9c, 0xe7, 0x5d, 0x52, 0x93, 0x0c, 0x32, 0x9a, 0x5b, 0xe1, 0xaa, + 0xca, 0xb8, 0x02, 0x6d, 0x3a, 0xd4, 0xb1, 0x3a, 0xf0, 0x5f, 0xbe, 0xb5, + 0x0d, 0x10, 0x6b, 0x38, 0x32, 0xac, 
0x76, 0x80, 0xbd, 0xca, 0x94, 0x71, + 0x7a, 0xf2, 0xc9, 0x35, 0x2a, 0xde, 0x9f, 0x42, 0x49, 0x18, 0x01, 0xab, + 0xbc, 0xef, 0x7c, 0x64, 0x3f, 0x58, 0x3d, 0x92, 0x59, 0xdb, 0x13, 0xdb, + 0x58, 0x6e, 0x0a, 0xe0, 0xb7, 0x91, 0x4a, 0x08, 0x20, 0xd6, 0x2e, 0x3c, + 0x45, 0xc9, 0x8b, 0x17, 0x79, 0xe7, 0xc7, 0x90, 0x99, 0x3a, 0x18, 0x25, +}; + +#else + +// k25519Precomp[i][j] = (j+1)*256^i*B +const uint8_t k25519Precomp[32][8][3][32] = { + { + { + {0x85, 0x3b, 0x8c, 0xf5, 0xc6, 0x93, 0xbc, 0x2f, 0x19, 0xe, 0x8c, + 0xfb, 0xc6, 0x2d, 0x93, 0xcf, 0xc2, 0x42, 0x3d, 0x64, 0x98, 0x48, + 0xb, 0x27, 0x65, 0xba, 0xd4, 0x33, 0x3a, 0x9d, 0xcf, 0x7}, + {0x3e, 0x91, 0x40, 0xd7, 0x5, 0x39, 0x10, 0x9d, 0xb3, 0xbe, 0x40, + 0xd1, 0x5, 0x9f, 0x39, 0xfd, 0x9, 0x8a, 0x8f, 0x68, 0x34, 0x84, + 0xc1, 0xa5, 0x67, 0x12, 0xf8, 0x98, 0x92, 0x2f, 0xfd, 0x44}, + {0x68, 0xaa, 0x7a, 0x87, 0x5, 0x12, 0xc9, 0xab, 0x9e, 0xc4, 0xaa, + 0xcc, 0x23, 0xe8, 0xd9, 0x26, 0x8c, 0x59, 0x43, 0xdd, 0xcb, 0x7d, + 0x1b, 0x5a, 0xa8, 0x65, 0xc, 0x9f, 0x68, 0x7b, 0x11, 0x6f}, + }, + { + {0xd7, 0x71, 0x3c, 0x93, 0xfc, 0xe7, 0x24, 0x92, 0xb5, 0xf5, 0xf, + 0x7a, 0x96, 0x9d, 0x46, 0x9f, 0x2, 0x7, 0xd6, 0xe1, 0x65, 0x9a, + 0xa6, 0x5a, 0x2e, 0x2e, 0x7d, 0xa8, 0x3f, 0x6, 0xc, 0x59}, + {0xa8, 0xd5, 0xb4, 0x42, 0x60, 0xa5, 0x99, 0x8a, 0xf6, 0xac, 0x60, + 0x4e, 0xc, 0x81, 0x2b, 0x8f, 0xaa, 0x37, 0x6e, 0xb1, 0x6b, 0x23, + 0x9e, 0xe0, 0x55, 0x25, 0xc9, 0x69, 0xa6, 0x95, 0xb5, 0x6b}, + {0x5f, 0x7a, 0x9b, 0xa5, 0xb3, 0xa8, 0xfa, 0x43, 0x78, 0xcf, 0x9a, + 0x5d, 0xdd, 0x6b, 0xc1, 0x36, 0x31, 0x6a, 0x3d, 0xb, 0x84, 0xa0, + 0xf, 0x50, 0x73, 0xb, 0xa5, 0x3e, 0xb1, 0xf5, 0x1a, 0x70}, + }, + { + {0x30, 0x97, 0xee, 0x4c, 0xa8, 0xb0, 0x25, 0xaf, 0x8a, 0x4b, 0x86, + 0xe8, 0x30, 0x84, 0x5a, 0x2, 0x32, 0x67, 0x1, 0x9f, 0x2, 0x50, + 0x1b, 0xc1, 0xf4, 0xf8, 0x80, 0x9a, 0x1b, 0x4e, 0x16, 0x7a}, + {0x65, 0xd2, 0xfc, 0xa4, 0xe8, 0x1f, 0x61, 0x56, 0x7d, 0xba, 0xc1, + 0xe5, 0xfd, 0x53, 0xd3, 0x3b, 0xbd, 0xd6, 0x4b, 0x21, 0x1a, 0xf3, + 0x31, 0x81, 0x62, 0xda, 0x5b, 0x55, 0x87, 0x15, 0xb9, 0x2a}, + {0x89, 0xd8, 0xd0, 0xd, 0x3f, 0x93, 0xae, 0x14, 0x62, 0xda, 0x35, + 0x1c, 0x22, 0x23, 0x94, 0x58, 0x4c, 0xdb, 0xf2, 0x8c, 0x45, 0xe5, + 0x70, 0xd1, 0xc6, 0xb4, 0xb9, 0x12, 0xaf, 0x26, 0x28, 0x5a}, + }, + { + {0x9f, 0x9, 0xfc, 0x8e, 0xb9, 0x51, 0x73, 0x28, 0x38, 0x25, 0xfd, + 0x7d, 0xf4, 0xc6, 0x65, 0x67, 0x65, 0x92, 0xa, 0xfb, 0x3d, 0x8d, + 0x34, 0xca, 0x27, 0x87, 0xe5, 0x21, 0x3, 0x91, 0xe, 0x68}, + {0xbf, 0x18, 0x68, 0x5, 0xa, 0x5, 0xfe, 0x95, 0xa9, 0xfa, 0x60, + 0x56, 0x71, 0x89, 0x7e, 0x32, 0x73, 0x50, 0xa0, 0x6, 0xcd, 0xe3, + 0xe8, 0xc3, 0x9a, 0xa4, 0x45, 0x74, 0x4c, 0x3f, 0x93, 0x27}, + {0x9, 0xff, 0x76, 0xc4, 0xe9, 0xfb, 0x13, 0x5a, 0x72, 0xc1, 0x5c, + 0x7b, 0x45, 0x39, 0x9e, 0x6e, 0x94, 0x44, 0x2b, 0x10, 0xf9, 0xdc, + 0xdb, 0x5d, 0x2b, 0x3e, 0x55, 0x63, 0xbf, 0xc, 0x9d, 0x7f}, + }, + { + {0x33, 0xbb, 0xa5, 0x8, 0x44, 0xbc, 0x12, 0xa2, 0x2, 0xed, 0x5e, + 0xc7, 0xc3, 0x48, 0x50, 0x8d, 0x44, 0xec, 0xbf, 0x5a, 0xc, 0xeb, + 0x1b, 0xdd, 0xeb, 0x6, 0xe2, 0x46, 0xf1, 0xcc, 0x45, 0x29}, + {0xba, 0xd6, 0x47, 0xa4, 0xc3, 0x82, 0x91, 0x7f, 0xb7, 0x29, 0x27, + 0x4b, 0xd1, 0x14, 0x0, 0xd5, 0x87, 0xa0, 0x64, 0xb8, 0x1c, 0xf1, + 0x3c, 0xe3, 0xf3, 0x55, 0x1b, 0xeb, 0x73, 0x7e, 0x4a, 0x15}, + {0x85, 0x82, 0x2a, 0x81, 0xf1, 0xdb, 0xbb, 0xbc, 0xfc, 0xd1, 0xbd, + 0xd0, 0x7, 0x8, 0xe, 0x27, 0x2d, 0xa7, 0xbd, 0x1b, 0xb, 0x67, + 0x1b, 0xb4, 0x9a, 0xb6, 0x3b, 0x6b, 0x69, 0xbe, 0xaa, 0x43}, + }, + { + {0x31, 0x71, 0x15, 0x77, 0xeb, 0xee, 0xc, 0x3a, 0x88, 0xaf, 0xc8, + 0x0, 0x89, 0x15, 0x27, 0x9b, 0x36, 
0xa7, 0x59, 0xda, 0x68, 0xb6, + 0x65, 0x80, 0xbd, 0x38, 0xcc, 0xa2, 0xb6, 0x7b, 0xe5, 0x51}, + {0xa4, 0x8c, 0x7d, 0x7b, 0xb6, 0x6, 0x98, 0x49, 0x39, 0x27, 0xd2, + 0x27, 0x84, 0xe2, 0x5b, 0x57, 0xb9, 0x53, 0x45, 0x20, 0xe7, 0x5c, + 0x8, 0xbb, 0x84, 0x78, 0x41, 0xae, 0x41, 0x4c, 0xb6, 0x38}, + {0x71, 0x4b, 0xea, 0x2, 0x67, 0x32, 0xac, 0x85, 0x1, 0xbb, 0xa1, + 0x41, 0x3, 0xe0, 0x70, 0xbe, 0x44, 0xc1, 0x3b, 0x8, 0x4b, 0xa2, + 0xe4, 0x53, 0xe3, 0x61, 0xd, 0x9f, 0x1a, 0xe9, 0xb8, 0x10}, + }, + { + {0xbf, 0xa3, 0x4e, 0x94, 0xd0, 0x5c, 0x1a, 0x6b, 0xd2, 0xc0, 0x9d, + 0xb3, 0x3a, 0x35, 0x70, 0x74, 0x49, 0x2e, 0x54, 0x28, 0x82, 0x52, + 0xb2, 0x71, 0x7e, 0x92, 0x3c, 0x28, 0x69, 0xea, 0x1b, 0x46}, + {0xb1, 0x21, 0x32, 0xaa, 0x9a, 0x2c, 0x6f, 0xba, 0xa7, 0x23, 0xba, + 0x3b, 0x53, 0x21, 0xa0, 0x6c, 0x3a, 0x2c, 0x19, 0x92, 0x4f, 0x76, + 0xea, 0x9d, 0xe0, 0x17, 0x53, 0x2e, 0x5d, 0xdd, 0x6e, 0x1d}, + {0xa2, 0xb3, 0xb8, 0x1, 0xc8, 0x6d, 0x83, 0xf1, 0x9a, 0xa4, 0x3e, + 0x5, 0x47, 0x5f, 0x3, 0xb3, 0xf3, 0xad, 0x77, 0x58, 0xba, 0x41, + 0x9c, 0x52, 0xa7, 0x90, 0xf, 0x6a, 0x1c, 0xbb, 0x9f, 0x7a}, + }, + { + {0x8f, 0x3e, 0xdd, 0x4, 0x66, 0x59, 0xb7, 0x59, 0x2c, 0x70, 0x88, + 0xe2, 0x77, 0x3, 0xb3, 0x6c, 0x23, 0xc3, 0xd9, 0x5e, 0x66, 0x9c, + 0x33, 0xb1, 0x2f, 0xe5, 0xbc, 0x61, 0x60, 0xe7, 0x15, 0x9}, + {0xd9, 0x34, 0x92, 0xf3, 0xed, 0x5d, 0xa7, 0xe2, 0xf9, 0x58, 0xb5, + 0xe1, 0x80, 0x76, 0x3d, 0x96, 0xfb, 0x23, 0x3c, 0x6e, 0xac, 0x41, + 0x27, 0x2c, 0xc3, 0x1, 0xe, 0x32, 0xa1, 0x24, 0x90, 0x3a}, + {0x1a, 0x91, 0xa2, 0xc9, 0xd9, 0xf5, 0xc1, 0xe7, 0xd7, 0xa7, 0xcc, + 0x8b, 0x78, 0x71, 0xa3, 0xb8, 0x32, 0x2a, 0xb6, 0xe, 0x19, 0x12, + 0x64, 0x63, 0x95, 0x4e, 0xcc, 0x2e, 0x5c, 0x7c, 0x90, 0x26}, + }, + }, + { + { + {0x1d, 0x9c, 0x2f, 0x63, 0xe, 0xdd, 0xcc, 0x2e, 0x15, 0x31, 0x89, + 0x76, 0x96, 0xb6, 0xd0, 0x51, 0x58, 0x7a, 0x63, 0xa8, 0x6b, 0xb7, + 0xdf, 0x52, 0x39, 0xef, 0xe, 0xa0, 0x49, 0x7d, 0xd3, 0x6d}, + {0x5e, 0x51, 0xaa, 0x49, 0x54, 0x63, 0x5b, 0xed, 0x3a, 0x82, 0xc6, + 0xb, 0x9f, 0xc4, 0x65, 0xa8, 0xc4, 0xd1, 0x42, 0x5b, 0xe9, 0x1f, + 0xc, 0x85, 0xb9, 0x15, 0xd3, 0x3, 0x6f, 0x6d, 0xd7, 0x30}, + {0xc7, 0xe4, 0x6, 0x21, 0x17, 0x44, 0x44, 0x6c, 0x69, 0x7f, 0x8d, + 0x92, 0x80, 0xd6, 0x53, 0xfb, 0x26, 0x3f, 0x4d, 0x69, 0xa4, 0x9e, + 0x73, 0xb4, 0xb0, 0x4b, 0x86, 0x2e, 0x11, 0x97, 0xc6, 0x10}, + }, + { + {0x5, 0xc8, 0x58, 0x83, 0xa0, 0x2a, 0xa6, 0xc, 0x47, 0x42, 0x20, + 0x7a, 0xe3, 0x4a, 0x3d, 0x6a, 0xdc, 0xed, 0x11, 0x3b, 0xa6, 0xd3, + 0x64, 0x74, 0xef, 0x6, 0x8, 0x55, 0xaf, 0x9b, 0xbf, 0x3}, + {0xde, 0x5f, 0xbe, 0x7d, 0x27, 0xc4, 0x93, 0x64, 0xa2, 0x7e, 0xad, + 0x19, 0xad, 0x4f, 0x5d, 0x26, 0x90, 0x45, 0x30, 0x46, 0xc8, 0xdf, + 0x0, 0xe, 0x9, 0xfe, 0x66, 0xed, 0xab, 0x1c, 0xe6, 0x25}, + {0x4, 0x66, 0x58, 0xcc, 0x28, 0xe1, 0x13, 0x3f, 0x7e, 0x74, 0x59, + 0xb4, 0xec, 0x73, 0x58, 0x6f, 0xf5, 0x68, 0x12, 0xcc, 0xed, 0x3d, + 0xb6, 0xa0, 0x2c, 0xe2, 0x86, 0x45, 0x63, 0x78, 0x6d, 0x56}, + }, + { + {0xd0, 0x2f, 0x5a, 0xc6, 0x85, 0x42, 0x5, 0xa1, 0xc3, 0x67, 0x16, + 0xf3, 0x2a, 0x11, 0x64, 0x6c, 0x58, 0xee, 0x1a, 0x73, 0x40, 0xe2, + 0xa, 0x68, 0x2a, 0xb2, 0x93, 0x47, 0xf3, 0xa5, 0xfb, 0x14}, + {0x34, 0x8, 0xc1, 0x9c, 0x9f, 0xa4, 0x37, 0x16, 0x51, 0xc4, 0x9b, + 0xa8, 0xd5, 0x56, 0x8e, 0xbc, 0xdb, 0xd2, 0x7f, 0x7f, 0xf, 0xec, + 0xb5, 0x1c, 0xd9, 0x35, 0xcc, 0x5e, 0xca, 0x5b, 0x97, 0x33}, + {0xd4, 0xf7, 0x85, 0x69, 0x16, 0x46, 0xd7, 0x3c, 0x57, 0x0, 0xc8, + 0xc9, 0x84, 0x5e, 0x3e, 0x59, 0x1e, 0x13, 0x61, 0x7b, 0xb6, 0xf2, + 0xc3, 0x2f, 0x6c, 0x52, 0xfc, 0x83, 0xea, 0x9c, 0x82, 0x14}, + }, + { + {0xb8, 0xec, 0x71, 0x4e, 0x2f, 
0xb, 0xe7, 0x21, 0xe3, 0x77, 0xa4, + 0x40, 0xb9, 0xdd, 0x56, 0xe6, 0x80, 0x4f, 0x1d, 0xce, 0xce, 0x56, + 0x65, 0xbf, 0x7e, 0x7b, 0x5d, 0x53, 0xc4, 0x3b, 0xfc, 0x5}, + {0xc2, 0x95, 0xdd, 0x97, 0x84, 0x7b, 0x43, 0xff, 0xa7, 0xb5, 0x4e, + 0xaa, 0x30, 0x4e, 0x74, 0x6c, 0x8b, 0xe8, 0x85, 0x3c, 0x61, 0x5d, + 0xc, 0x9e, 0x73, 0x81, 0x75, 0x5f, 0x1e, 0xc7, 0xd9, 0x2f}, + {0xdd, 0xde, 0xaf, 0x52, 0xae, 0xb3, 0xb8, 0x24, 0xcf, 0x30, 0x3b, + 0xed, 0x8c, 0x63, 0x95, 0x34, 0x95, 0x81, 0xbe, 0xa9, 0x83, 0xbc, + 0xa4, 0x33, 0x4, 0x1f, 0x65, 0x5c, 0x47, 0x67, 0x37, 0x37}, + }, + { + {0x90, 0x65, 0x24, 0x14, 0xcb, 0x95, 0x40, 0x63, 0x35, 0x55, 0xc1, + 0x16, 0x40, 0x14, 0x12, 0xef, 0x60, 0xbc, 0x10, 0x89, 0xc, 0x14, + 0x38, 0x9e, 0x8c, 0x7c, 0x90, 0x30, 0x57, 0x90, 0xf5, 0x6b}, + {0xd9, 0xad, 0xd1, 0x40, 0xfd, 0x99, 0xba, 0x2f, 0x27, 0xd0, 0xf4, + 0x96, 0x6f, 0x16, 0x7, 0xb3, 0xae, 0x3b, 0xf0, 0x15, 0x52, 0xf0, + 0x63, 0x43, 0x99, 0xf9, 0x18, 0x3b, 0x6c, 0xa5, 0xbe, 0x1f}, + {0x8a, 0x5b, 0x41, 0xe1, 0xf1, 0x78, 0xa7, 0xf, 0x7e, 0xa7, 0xc3, + 0xba, 0xf7, 0x9f, 0x40, 0x6, 0x50, 0x9a, 0xa2, 0x9a, 0xb8, 0xd7, + 0x52, 0x6f, 0x56, 0x5a, 0x63, 0x7a, 0xf6, 0x1c, 0x52, 0x2}, + }, + { + {0xe4, 0x5e, 0x2f, 0x77, 0x20, 0x67, 0x14, 0xb1, 0xce, 0x9a, 0x7, + 0x96, 0xb1, 0x94, 0xf8, 0xe8, 0x4a, 0x82, 0xac, 0x0, 0x4d, 0x22, + 0xf8, 0x4a, 0xc4, 0x6c, 0xcd, 0xf7, 0xd9, 0x53, 0x17, 0x0}, + {0x94, 0x52, 0x9d, 0xa, 0xb, 0xee, 0x3f, 0x51, 0x66, 0x5a, 0xdf, + 0xf, 0x5c, 0xe7, 0x98, 0x8f, 0xce, 0x7, 0xe1, 0xbf, 0x88, 0x86, + 0x61, 0xd4, 0xed, 0x2c, 0x38, 0x71, 0x7e, 0xa, 0xa0, 0x3f}, + {0x34, 0xdb, 0x3d, 0x96, 0x2d, 0x23, 0x69, 0x3c, 0x58, 0x38, 0x97, + 0xb4, 0xda, 0x87, 0xde, 0x1d, 0x85, 0xf2, 0x91, 0xa0, 0xf9, 0xd1, + 0xd7, 0xaa, 0xb6, 0xed, 0x48, 0xa0, 0x2f, 0xfe, 0xb5, 0x12}, + }, + { + {0x92, 0x1e, 0x6f, 0xad, 0x26, 0x7c, 0x2b, 0xdf, 0x13, 0x89, 0x4b, + 0x50, 0x23, 0xd3, 0x66, 0x4b, 0xc3, 0x8b, 0x1c, 0x75, 0xc0, 0x9d, + 0x40, 0x8c, 0xb8, 0xc7, 0x96, 0x7, 0xc2, 0x93, 0x7e, 0x6f}, + {0x4d, 0xe3, 0xfc, 0x96, 0xc4, 0xfb, 0xf0, 0x71, 0xed, 0x5b, 0xf3, + 0xad, 0x6b, 0x82, 0xb9, 0x73, 0x61, 0xc5, 0x28, 0xff, 0x61, 0x72, + 0x4, 0xd2, 0x6f, 0x20, 0xb1, 0x6f, 0xf9, 0x76, 0x9b, 0x74}, + {0x5, 0xae, 0xa6, 0xae, 0x4, 0xf6, 0x5a, 0x1f, 0x99, 0x9c, 0xe4, + 0xbe, 0xf1, 0x51, 0x23, 0xc1, 0x66, 0x6b, 0xff, 0xee, 0xb5, 0x8, + 0xa8, 0x61, 0x51, 0x21, 0xe0, 0x1, 0xf, 0xc1, 0xce, 0xf}, + }, + { + {0x45, 0x4e, 0x24, 0xc4, 0x9d, 0xd2, 0xf2, 0x3d, 0xa, 0xde, 0xd8, + 0x93, 0x74, 0xe, 0x2, 0x2b, 0x4d, 0x21, 0xc, 0x82, 0x7e, 0x6, + 0xc8, 0x6c, 0xa, 0xb9, 0xea, 0x6f, 0x16, 0x79, 0x37, 0x41}, + {0x44, 0x1e, 0xfe, 0x49, 0xa6, 0x58, 0x4d, 0x64, 0x7e, 0x77, 0xad, + 0x31, 0xa2, 0xae, 0xfc, 0x21, 0xd2, 0xd0, 0x7f, 0x88, 0x5a, 0x1c, + 0x44, 0x2, 0xf3, 0x11, 0xc5, 0x83, 0x71, 0xaa, 0x1, 0x49}, + {0xf0, 0xf8, 0x1a, 0x8c, 0x54, 0xb7, 0xb1, 0x8, 0xb4, 0x99, 0x62, + 0x24, 0x7c, 0x7a, 0xf, 0xce, 0x39, 0xd9, 0x6, 0x1e, 0xf9, 0xb0, + 0x60, 0xf7, 0x13, 0x12, 0x6d, 0x72, 0x7b, 0x88, 0xbb, 0x41}, + }, + }, + { + { + {0xae, 0x91, 0x66, 0x7c, 0x59, 0x4c, 0x23, 0x7e, 0xc8, 0xb4, 0x85, + 0xa, 0x3d, 0x9d, 0x88, 0x64, 0xe7, 0xfa, 0x4a, 0x35, 0xc, 0xc9, + 0xe2, 0xda, 0x1d, 0x9e, 0x6a, 0xc, 0x7, 0x1e, 0x87, 0xa}, + {0xbe, 0x46, 0x43, 0x74, 0x44, 0x7d, 0xe8, 0x40, 0x25, 0x2b, 0xb5, + 0x15, 0xd4, 0xda, 0x48, 0x1d, 0x3e, 0x60, 0x3b, 0xa1, 0x18, 0x8a, + 0x3a, 0x7c, 0xf7, 0xbd, 0xcd, 0x2f, 0xc1, 0x28, 0xb7, 0x4e}, + {0x89, 0x89, 0xbc, 0x4b, 0x99, 0xb5, 0x1, 0x33, 0x60, 0x42, 0xdd, + 0x5b, 0x3a, 0xae, 0x6b, 0x73, 0x3c, 0x9e, 0xd5, 0x19, 0xe2, 0xad, + 0x61, 0xd, 0x64, 0xd4, 0x85, 0x26, 
0xf, 0x30, 0xe7, 0x3e}, + }, + { + {0x18, 0x75, 0x1e, 0x84, 0x47, 0x79, 0xfa, 0x43, 0xd7, 0x46, 0x9c, + 0x63, 0x59, 0xfa, 0xc6, 0xe5, 0x74, 0x2b, 0x5, 0xe3, 0x1d, 0x5e, + 0x6, 0xa1, 0x30, 0x90, 0xb8, 0xcf, 0xa2, 0xc6, 0x47, 0x7d}, + {0xb7, 0xd6, 0x7d, 0x9e, 0xe4, 0x55, 0xd2, 0xf5, 0xac, 0x1e, 0xb, + 0x61, 0x5c, 0x11, 0x16, 0x80, 0xca, 0x87, 0xe1, 0x92, 0x5d, 0x97, + 0x99, 0x3c, 0xc2, 0x25, 0x91, 0x97, 0x62, 0x57, 0x81, 0x13}, + {0xe0, 0xd6, 0xf0, 0x8e, 0x14, 0xd0, 0xda, 0x3f, 0x3c, 0x6f, 0x54, + 0x91, 0x9a, 0x74, 0x3e, 0x9d, 0x57, 0x81, 0xbb, 0x26, 0x10, 0x62, + 0xec, 0x71, 0x80, 0xec, 0xc9, 0x34, 0x8d, 0xf5, 0x8c, 0x14}, + }, + { + {0x6d, 0x75, 0xe4, 0x9a, 0x7d, 0x2f, 0x57, 0xe2, 0x7f, 0x48, 0xf3, + 0x88, 0xbb, 0x45, 0xc3, 0x56, 0x8d, 0xa8, 0x60, 0x69, 0x6d, 0xb, + 0xd1, 0x9f, 0xb9, 0xa1, 0xae, 0x4e, 0xad, 0xeb, 0x8f, 0x27}, + {0x27, 0xf0, 0x34, 0x79, 0xf6, 0x92, 0xa4, 0x46, 0xa9, 0xa, 0x84, + 0xf6, 0xbe, 0x84, 0x99, 0x46, 0x54, 0x18, 0x61, 0x89, 0x2a, 0xbc, + 0xa1, 0x5c, 0xd4, 0xbb, 0x5d, 0xbd, 0x1e, 0xfa, 0xf2, 0x3f}, + {0x66, 0x39, 0x93, 0x8c, 0x1f, 0x68, 0xaa, 0xb1, 0x98, 0xc, 0x29, + 0x20, 0x9c, 0x94, 0x21, 0x8c, 0x52, 0x3c, 0x9d, 0x21, 0x91, 0x52, + 0x11, 0x39, 0x7b, 0x67, 0x9c, 0xfe, 0x2, 0xdd, 0x4, 0x41}, + }, + { + {0xb8, 0x6a, 0x9, 0xdb, 0x6, 0x4e, 0x21, 0x81, 0x35, 0x4f, 0xe4, + 0xc, 0xc9, 0xb6, 0xa8, 0x21, 0xf5, 0x2a, 0x9e, 0x40, 0x2a, 0xc1, + 0x24, 0x65, 0x81, 0xa4, 0xfc, 0x8e, 0xa4, 0xb5, 0x65, 0x1}, + {0x2a, 0x42, 0x24, 0x11, 0x5e, 0xbf, 0xb2, 0x72, 0xb5, 0x3a, 0xa3, + 0x98, 0x33, 0xc, 0xfa, 0xa1, 0x66, 0xb6, 0x52, 0xfa, 0x1, 0x61, + 0xcb, 0x94, 0xd5, 0x53, 0xaf, 0xaf, 0x0, 0x3b, 0x86, 0x2c}, + {0x76, 0x6a, 0x84, 0xa0, 0x74, 0xa4, 0x90, 0xf1, 0xc0, 0x7c, 0x2f, + 0xcd, 0x84, 0xf9, 0xef, 0x12, 0x8f, 0x2b, 0xaa, 0x58, 0x6, 0x29, + 0x5e, 0x69, 0xb8, 0xc8, 0xfe, 0xbf, 0xd9, 0x67, 0x1b, 0x59}, + }, + { + {0x5d, 0xb5, 0x18, 0x9f, 0x71, 0xb3, 0xb9, 0x99, 0x1e, 0x64, 0x8c, + 0xa1, 0xfa, 0xe5, 0x65, 0xe4, 0xed, 0x5, 0x9f, 0xc2, 0x36, 0x11, + 0x8, 0x61, 0x8b, 0x12, 0x30, 0x70, 0x86, 0x4f, 0x9b, 0x48}, + {0xfa, 0x9b, 0xb4, 0x80, 0x1c, 0xd, 0x2f, 0x31, 0x8a, 0xec, 0xf3, + 0xab, 0x5e, 0x51, 0x79, 0x59, 0x88, 0x1c, 0xf0, 0x9e, 0xc0, 0x33, + 0x70, 0x72, 0xcb, 0x7b, 0x8f, 0xca, 0xc7, 0x2e, 0xe0, 0x3d}, + {0xef, 0x92, 0xeb, 0x3a, 0x2d, 0x10, 0x32, 0xd2, 0x61, 0xa8, 0x16, + 0x61, 0xb4, 0x53, 0x62, 0xe1, 0x24, 0xaa, 0xb, 0x19, 0xe7, 0xab, + 0x7e, 0x3d, 0xbf, 0xbe, 0x6c, 0x49, 0xba, 0xfb, 0xf5, 0x49}, + }, + { + {0x2e, 0x57, 0x9c, 0x1e, 0x8c, 0x62, 0x5d, 0x15, 0x41, 0x47, 0x88, + 0xc5, 0xac, 0x86, 0x4d, 0x8a, 0xeb, 0x63, 0x57, 0x51, 0xf6, 0x52, + 0xa3, 0x91, 0x5b, 0x51, 0x67, 0x88, 0xc2, 0xa6, 0xa1, 0x6}, + {0xd4, 0xcf, 0x5b, 0x8a, 0x10, 0x9a, 0x94, 0x30, 0xeb, 0x73, 0x64, + 0xbc, 0x70, 0xdd, 0x40, 0xdc, 0x1c, 0xd, 0x7c, 0x30, 0xc1, 0x94, + 0xc2, 0x92, 0x74, 0x6e, 0xfa, 0xcb, 0x6d, 0xa8, 0x4, 0x56}, + {0xb6, 0x64, 0x17, 0x7c, 0xd4, 0xd1, 0x88, 0x72, 0x51, 0x8b, 0x41, + 0xe0, 0x40, 0x11, 0x54, 0x72, 0xd1, 0xf6, 0xac, 0x18, 0x60, 0x1a, + 0x3, 0x9f, 0xc6, 0x42, 0x27, 0xfe, 0x89, 0x9e, 0x98, 0x20}, + }, + { + {0x2e, 0xec, 0xea, 0x85, 0x8b, 0x27, 0x74, 0x16, 0xdf, 0x2b, 0xcb, + 0x7a, 0x7, 0xdc, 0x21, 0x56, 0x5a, 0xf4, 0xcb, 0x61, 0x16, 0x4c, + 0xa, 0x64, 0xd3, 0x95, 0x5, 0xf7, 0x50, 0x99, 0xb, 0x73}, + {0x7f, 0xcc, 0x2d, 0x3a, 0xfd, 0x77, 0x97, 0x49, 0x92, 0xd8, 0x4f, + 0xa5, 0x2c, 0x7c, 0x85, 0x32, 0xa0, 0xe3, 0x7, 0xd2, 0x64, 0xd8, + 0x79, 0xa2, 0x29, 0x7e, 0xa6, 0xc, 0x1d, 0xed, 0x3, 0x4}, + {0x52, 0xc5, 0x4e, 0x87, 0x35, 0x2d, 0x4b, 0xc9, 0x8d, 0x6f, 0x24, + 0x98, 0xcf, 0xc8, 0xe6, 0xc5, 0xce, 
0x35, 0xc0, 0x16, 0xfa, 0x46, + 0xcb, 0xf7, 0xcc, 0x3d, 0x30, 0x8, 0x43, 0x45, 0xd7, 0x5b}, + }, + { + {0x2a, 0x79, 0xe7, 0x15, 0x21, 0x93, 0xc4, 0x85, 0xc9, 0xdd, 0xcd, + 0xbd, 0xa2, 0x89, 0x4c, 0xc6, 0x62, 0xd7, 0xa3, 0xad, 0xa8, 0x3d, + 0x1e, 0x9d, 0x2c, 0xf8, 0x67, 0x30, 0x12, 0xdb, 0xb7, 0x5b}, + {0xc2, 0x4c, 0xb2, 0x28, 0x95, 0xd1, 0x9a, 0x7f, 0x81, 0xc1, 0x35, + 0x63, 0x65, 0x54, 0x6b, 0x7f, 0x36, 0x72, 0xc0, 0x4f, 0x6e, 0xb6, + 0xb8, 0x66, 0x83, 0xad, 0x80, 0x73, 0x0, 0x78, 0x3a, 0x13}, + {0xbe, 0x62, 0xca, 0xc6, 0x67, 0xf4, 0x61, 0x9, 0xee, 0x52, 0x19, + 0x21, 0xd6, 0x21, 0xec, 0x4, 0x70, 0x47, 0xd5, 0x9b, 0x77, 0x60, + 0x23, 0x18, 0xd2, 0xe0, 0xf0, 0x58, 0x6d, 0xca, 0xd, 0x74}, + }, + }, + { + { + {0x3c, 0x43, 0x78, 0x4, 0x57, 0x8c, 0x1a, 0x23, 0x9d, 0x43, 0x81, + 0xc2, 0xe, 0x27, 0xb5, 0xb7, 0x9f, 0x7, 0xd9, 0xe3, 0xea, 0x99, + 0xaa, 0xdb, 0xd9, 0x3, 0x2b, 0x6c, 0x25, 0xf5, 0x3, 0x2c}, + {0x4e, 0xce, 0xcf, 0x52, 0x7, 0xee, 0x48, 0xdf, 0xb7, 0x8, 0xec, + 0x6, 0xf3, 0xfa, 0xff, 0xc3, 0xc4, 0x59, 0x54, 0xb9, 0x2a, 0xb, + 0x71, 0x5, 0x8d, 0xa3, 0x3e, 0x96, 0xfa, 0x25, 0x1d, 0x16}, + {0x7d, 0xa4, 0x53, 0x7b, 0x75, 0x18, 0xf, 0x79, 0x79, 0x58, 0xc, + 0xcf, 0x30, 0x1, 0x7b, 0x30, 0xf9, 0xf7, 0x7e, 0x25, 0x77, 0x3d, + 0x90, 0x31, 0xaf, 0xbb, 0x96, 0xbd, 0xbd, 0x68, 0x94, 0x69}, + }, + { + {0x48, 0x19, 0xa9, 0x6a, 0xe6, 0x3d, 0xdd, 0xd8, 0xcc, 0xd2, 0xc0, + 0x2f, 0xc2, 0x64, 0x50, 0x48, 0x2f, 0xea, 0xfd, 0x34, 0x66, 0x24, + 0x48, 0x9b, 0x3a, 0x2e, 0x4a, 0x6c, 0x4e, 0x1c, 0x3e, 0x29}, + {0xcf, 0xfe, 0xda, 0xf4, 0x46, 0x2f, 0x1f, 0xbd, 0xf7, 0xd6, 0x7f, + 0xa4, 0x14, 0x1, 0xef, 0x7c, 0x7f, 0xb3, 0x47, 0x4a, 0xda, 0xfd, + 0x1f, 0xd3, 0x85, 0x57, 0x90, 0x73, 0xa4, 0x19, 0x52, 0x52}, + {0xe1, 0x12, 0x51, 0x92, 0x4b, 0x13, 0x6e, 0x37, 0xa0, 0x5d, 0xa1, + 0xdc, 0xb5, 0x78, 0x37, 0x70, 0x11, 0x31, 0x1c, 0x46, 0xaf, 0x89, + 0x45, 0xb0, 0x23, 0x28, 0x3, 0x7f, 0x44, 0x5c, 0x60, 0x5b}, + }, + { + {0x4c, 0xf0, 0xe7, 0xf0, 0xc6, 0xfe, 0xe9, 0x3b, 0x62, 0x49, 0xe3, + 0x75, 0x9e, 0x57, 0x6a, 0x86, 0x1a, 0xe6, 0x1d, 0x1e, 0x16, 0xef, + 0x42, 0x55, 0xd5, 0xbd, 0x5a, 0xcc, 0xf4, 0xfe, 0x12, 0x2f}, + {0x89, 0x7c, 0xc4, 0x20, 0x59, 0x80, 0x65, 0xb9, 0xcc, 0x8f, 0x3b, + 0x92, 0xc, 0x10, 0xf0, 0xe7, 0x77, 0xef, 0xe2, 0x2, 0x65, 0x25, + 0x1, 0x0, 0xee, 0xb3, 0xae, 0xa8, 0xce, 0x6d, 0xa7, 0x24}, + {0x40, 0xc7, 0xc0, 0xdf, 0xb2, 0x22, 0x45, 0xa, 0x7, 0xa4, 0xc9, + 0x40, 0x7f, 0x6e, 0xd0, 0x10, 0x68, 0xf6, 0xcf, 0x78, 0x41, 0x14, + 0xcf, 0xc6, 0x90, 0x37, 0xa4, 0x18, 0x25, 0x7b, 0x60, 0x5e}, + }, + { + {0x14, 0xcf, 0x96, 0xa5, 0x1c, 0x43, 0x2c, 0xa0, 0x0, 0xe4, 0xd3, + 0xae, 0x40, 0x2d, 0xc4, 0xe3, 0xdb, 0x26, 0xf, 0x2e, 0x80, 0x26, + 0x45, 0xd2, 0x68, 0x70, 0x45, 0x9e, 0x13, 0x33, 0x1f, 0x20}, + {0x18, 0x18, 0xdf, 0x6c, 0x8f, 0x1d, 0xb3, 0x58, 0xa2, 0x58, 0x62, + 0xc3, 0x4f, 0xa7, 0xcf, 0x35, 0x6e, 0x1d, 0xe6, 0x66, 0x4f, 0xff, + 0xb3, 0xe1, 0xf7, 0xd5, 0xcd, 0x6c, 0xab, 0xac, 0x67, 0x50}, + {0x51, 0x9d, 0x3, 0x8, 0x6b, 0x7f, 0x52, 0xfd, 0x6, 0x0, 0x7c, + 0x1, 0x64, 0x49, 0xb1, 0x18, 0xa8, 0xa4, 0x25, 0x2e, 0xb0, 0xe, + 0x22, 0xd5, 0x75, 0x3, 0x46, 0x62, 0x88, 0xba, 0x7c, 0x39}, + }, + { + {0xe7, 0x79, 0x13, 0xc8, 0xfb, 0xc3, 0x15, 0x78, 0xf1, 0x2a, 0xe1, + 0xdd, 0x20, 0x94, 0x61, 0xa6, 0xd5, 0xfd, 0xa8, 0x85, 0xf8, 0xc0, + 0xa9, 0xff, 0x52, 0xc2, 0xe1, 0xc1, 0x22, 0x40, 0x1b, 0x77}, + {0xb2, 0x59, 0x59, 0xf0, 0x93, 0x30, 0xc1, 0x30, 0x76, 0x79, 0xa9, + 0xe9, 0x8d, 0xa1, 0x3a, 0xe2, 0x26, 0x5e, 0x1d, 0x72, 0x91, 0xd4, + 0x2f, 0x22, 0x3a, 0x6c, 0x6e, 0x76, 0x20, 0xd3, 0x39, 0x23}, + {0xa7, 0x2f, 0x3a, 0x51, 0x86, 
0xd9, 0x7d, 0xd8, 0x8, 0xcf, 0xd4, + 0xf9, 0x71, 0x9b, 0xac, 0xf5, 0xb3, 0x83, 0xa2, 0x1e, 0x1b, 0xc3, + 0x6b, 0xd0, 0x76, 0x1a, 0x97, 0x19, 0x92, 0x18, 0x1a, 0x33}, + }, + { + {0xaf, 0x72, 0x75, 0x9d, 0x3a, 0x2f, 0x51, 0x26, 0x9e, 0x4a, 0x7, + 0x68, 0x88, 0xe2, 0xcb, 0x5b, 0xc4, 0xf7, 0x80, 0x11, 0xc1, 0xc1, + 0xed, 0x84, 0x7b, 0xa6, 0x49, 0xf6, 0x9f, 0x61, 0xc9, 0x1a}, + {0xc6, 0x80, 0x4f, 0xfb, 0x45, 0x6f, 0x16, 0xf5, 0xcf, 0x75, 0xc7, + 0x61, 0xde, 0xc7, 0x36, 0x9c, 0x1c, 0xd9, 0x41, 0x90, 0x1b, 0xe8, + 0xd4, 0xe3, 0x21, 0xfe, 0xbd, 0x83, 0x6b, 0x7c, 0x16, 0x31}, + {0x68, 0x10, 0x4b, 0x52, 0x42, 0x38, 0x2b, 0xf2, 0x87, 0xe9, 0x9c, + 0xee, 0x3b, 0x34, 0x68, 0x50, 0xc8, 0x50, 0x62, 0x4a, 0x84, 0x71, + 0x9d, 0xfc, 0x11, 0xb1, 0x8, 0x1f, 0x34, 0x36, 0x24, 0x61}, + }, + { + {0x38, 0x26, 0x2d, 0x1a, 0xe3, 0x49, 0x63, 0x8b, 0x35, 0xfd, 0xd3, + 0x9b, 0x0, 0xb7, 0xdf, 0x9d, 0xa4, 0x6b, 0xa0, 0xa3, 0xb8, 0xf1, + 0x8b, 0x7f, 0x45, 0x4, 0xd9, 0x78, 0x31, 0xaa, 0x22, 0x15}, + {0x8d, 0x89, 0x4e, 0x87, 0xdb, 0x41, 0x9d, 0xd9, 0x20, 0xdc, 0x7, + 0x6c, 0xf1, 0xa5, 0xfe, 0x9, 0xbc, 0x9b, 0xf, 0xd0, 0x67, 0x2c, + 0x3d, 0x79, 0x40, 0xff, 0x5e, 0x9e, 0x30, 0xe2, 0xeb, 0x46}, + {0x38, 0x49, 0x61, 0x69, 0x53, 0x2f, 0x38, 0x2c, 0x10, 0x6d, 0x2d, + 0xb7, 0x9a, 0x40, 0xfe, 0xda, 0x27, 0xf2, 0x46, 0xb6, 0x91, 0x33, + 0xc8, 0xe8, 0x6c, 0x30, 0x24, 0x5, 0xf5, 0x70, 0xfe, 0x45}, + }, + { + {0x91, 0x14, 0x95, 0xc8, 0x20, 0x49, 0xf2, 0x62, 0xa2, 0xc, 0x63, + 0x3f, 0xc8, 0x7, 0xf0, 0x5, 0xb8, 0xd4, 0xc9, 0xf5, 0xd2, 0x45, + 0xbb, 0x6f, 0x45, 0x22, 0x7a, 0xb5, 0x6d, 0x9f, 0x61, 0x16}, + {0x8c, 0xb, 0xc, 0x96, 0xa6, 0x75, 0x48, 0xda, 0x20, 0x2f, 0xe, + 0xef, 0x76, 0xd0, 0x68, 0x5b, 0xd4, 0x8f, 0xb, 0x3d, 0xcf, 0x51, + 0xfb, 0x7, 0xd4, 0x92, 0xe3, 0xa0, 0x23, 0x16, 0x8d, 0x42}, + {0xfd, 0x8, 0xa3, 0x1, 0x44, 0x4a, 0x4f, 0x8, 0xac, 0xca, 0xa5, + 0x76, 0xc3, 0x19, 0x22, 0xa8, 0x7d, 0xbc, 0xd1, 0x43, 0x46, 0xde, + 0xb8, 0xde, 0xc6, 0x38, 0xbd, 0x60, 0x2d, 0x59, 0x81, 0x1d}, + }, + }, + { + { + {0xe8, 0xc5, 0x85, 0x7b, 0x9f, 0xb6, 0x65, 0x87, 0xb2, 0xba, 0x68, + 0xd1, 0x8b, 0x67, 0xf0, 0x6f, 0x9b, 0xf, 0x33, 0x1d, 0x7c, 0xe7, + 0x70, 0x3a, 0x7c, 0x8e, 0xaf, 0xb0, 0x51, 0x6d, 0x5f, 0x3a}, + {0x5f, 0xac, 0xd, 0xa6, 0x56, 0x87, 0x36, 0x61, 0x57, 0xdc, 0xab, + 0xeb, 0x6a, 0x2f, 0xe0, 0x17, 0x7d, 0xf, 0xce, 0x4c, 0x2d, 0x3f, + 0x19, 0x7f, 0xf0, 0xdc, 0xec, 0x89, 0x77, 0x4a, 0x23, 0x20}, + {0x52, 0xb2, 0x78, 0x71, 0xb6, 0xd, 0xd2, 0x76, 0x60, 0xd1, 0x1e, + 0xd5, 0xf9, 0x34, 0x1c, 0x7, 0x70, 0x11, 0xe4, 0xb3, 0x20, 0x4a, + 0x2a, 0xf6, 0x66, 0xe3, 0xff, 0x3c, 0x35, 0x82, 0xd6, 0x7c}, + }, + { + {0xf3, 0xf4, 0xac, 0x68, 0x60, 0xcd, 0x65, 0xa6, 0xd3, 0xe3, 0xd7, + 0x3c, 0x18, 0x2d, 0xd9, 0x42, 0xd9, 0x25, 0x60, 0x33, 0x9d, 0x38, + 0x59, 0x57, 0xff, 0xd8, 0x2c, 0x2b, 0x3b, 0x25, 0xf0, 0x3e}, + {0xb6, 0xfa, 0x87, 0xd8, 0x5b, 0xa4, 0xe1, 0xb, 0x6e, 0x3b, 0x40, + 0xba, 0x32, 0x6a, 0x84, 0x2a, 0x0, 0x60, 0x6e, 0xe9, 0x12, 0x10, + 0x92, 0xd9, 0x43, 0x9, 0xdc, 0x3b, 0x86, 0xc8, 0x38, 0x28}, + {0x30, 0x50, 0x46, 0x4a, 0xcf, 0xb0, 0x6b, 0xd1, 0xab, 0x77, 0xc5, + 0x15, 0x41, 0x6b, 0x49, 0xfa, 0x9d, 0x41, 0xab, 0xf4, 0x8a, 0xae, + 0xcf, 0x82, 0x12, 0x28, 0xa8, 0x6, 0xa6, 0xb8, 0xdc, 0x21}, + }, + { + {0xba, 0x31, 0x77, 0xbe, 0xfa, 0x0, 0x8d, 0x9a, 0x89, 0x18, 0x9e, + 0x62, 0x7e, 0x60, 0x3, 0x82, 0x7f, 0xd9, 0xf3, 0x43, 0x37, 0x2, + 0xcc, 0xb2, 0x8b, 0x67, 0x6f, 0x6c, 0xbf, 0xd, 0x84, 0x5d}, + {0xc8, 0x9f, 0x9d, 0x8c, 0x46, 0x4, 0x60, 0x5c, 0xcb, 0xa3, 0x2a, + 0xd4, 0x6e, 0x9, 0x40, 0x25, 0x9c, 0x2f, 0xee, 0x12, 0x4c, 0x4d, + 0x5b, 0x12, 0xab, 
0x1d, 0xa3, 0x94, 0x81, 0xd0, 0xc3, 0xb}, + {0x8b, 0xe1, 0x9f, 0x30, 0xd, 0x38, 0x6e, 0x70, 0xc7, 0x65, 0xe1, + 0xb9, 0xa6, 0x2d, 0xb0, 0x6e, 0xab, 0x20, 0xae, 0x7d, 0x99, 0xba, + 0xbb, 0x57, 0xdd, 0x96, 0xc1, 0x2a, 0x23, 0x76, 0x42, 0x3a}, + }, + { + {0xcb, 0x7e, 0x44, 0xdb, 0x72, 0xc1, 0xf8, 0x3b, 0xbd, 0x2d, 0x28, + 0xc6, 0x1f, 0xc4, 0xcf, 0x5f, 0xfe, 0x15, 0xaa, 0x75, 0xc0, 0xff, + 0xac, 0x80, 0xf9, 0xa9, 0xe1, 0x24, 0xe8, 0xc9, 0x70, 0x7}, + {0xfa, 0x84, 0x70, 0x8a, 0x2c, 0x43, 0x42, 0x4b, 0x45, 0xe5, 0xb9, + 0xdf, 0xe3, 0x19, 0x8a, 0x89, 0x5d, 0xe4, 0x58, 0x9c, 0x21, 0x0, + 0x9f, 0xbe, 0xd1, 0xeb, 0x6d, 0xa1, 0xce, 0x77, 0xf1, 0x1f}, + {0xfd, 0xb5, 0xb5, 0x45, 0x9a, 0xd9, 0x61, 0xcf, 0x24, 0x79, 0x3a, + 0x1b, 0xe9, 0x84, 0x9, 0x86, 0x89, 0x3e, 0x3e, 0x30, 0x19, 0x9, + 0x30, 0xe7, 0x1e, 0xb, 0x50, 0x41, 0xfd, 0x64, 0xf2, 0x39}, + }, + { + {0xe1, 0x7b, 0x9, 0xfe, 0xab, 0x4a, 0x9b, 0xd1, 0x29, 0x19, 0xe0, + 0xdf, 0xe1, 0xfc, 0x6d, 0xa4, 0xff, 0xf1, 0xa6, 0x2c, 0x94, 0x8, + 0xc9, 0xc3, 0x4e, 0xf1, 0x35, 0x2c, 0x27, 0x21, 0xc6, 0x65}, + {0x9c, 0xe2, 0xe7, 0xdb, 0x17, 0x34, 0xad, 0xa7, 0x9c, 0x13, 0x9c, + 0x2b, 0x6a, 0x37, 0x94, 0xbd, 0xa9, 0x7b, 0x59, 0x93, 0x8e, 0x1b, + 0xe9, 0xa0, 0x40, 0x98, 0x88, 0x68, 0x34, 0xd7, 0x12, 0x17}, + {0xdd, 0x93, 0x31, 0xce, 0xf8, 0x89, 0x2b, 0xe7, 0xbb, 0xc0, 0x25, + 0xa1, 0x56, 0x33, 0x10, 0x4d, 0x83, 0xfe, 0x1c, 0x2e, 0x3d, 0xa9, + 0x19, 0x4, 0x72, 0xe2, 0x9c, 0xb1, 0xa, 0x80, 0xf9, 0x22}, + }, + { + {0xac, 0xfd, 0x6e, 0x9a, 0xdd, 0x9f, 0x2, 0x42, 0x41, 0x49, 0xa5, + 0x34, 0xbe, 0xce, 0x12, 0xb9, 0x7b, 0xf3, 0xbd, 0x87, 0xb9, 0x64, + 0xf, 0x64, 0xb4, 0xca, 0x98, 0x85, 0xd3, 0xa4, 0x71, 0x41}, + {0xcb, 0xf8, 0x9e, 0x3e, 0x8a, 0x36, 0x5a, 0x60, 0x15, 0x47, 0x50, + 0xa5, 0x22, 0xc0, 0xe9, 0xe3, 0x8f, 0x24, 0x24, 0x5f, 0xb0, 0x48, + 0x3d, 0x55, 0xe5, 0x26, 0x76, 0x64, 0xcd, 0x16, 0xf4, 0x13}, + {0x8c, 0x4c, 0xc9, 0x99, 0xaa, 0x58, 0x27, 0xfa, 0x7, 0xb8, 0x0, + 0xb0, 0x6f, 0x6f, 0x0, 0x23, 0x92, 0x53, 0xda, 0xad, 0xdd, 0x91, + 0xd2, 0xfb, 0xab, 0xd1, 0x4b, 0x57, 0xfa, 0x14, 0x82, 0x50}, + }, + { + {0xd6, 0x3, 0xd0, 0x53, 0xbb, 0x15, 0x1a, 0x46, 0x65, 0xc9, 0xf3, + 0xbc, 0x88, 0x28, 0x10, 0xb2, 0x5a, 0x3a, 0x68, 0x6c, 0x75, 0x76, + 0xc5, 0x27, 0x47, 0xb4, 0x6c, 0xc8, 0xa4, 0x58, 0x77, 0x3a}, + {0x4b, 0xfe, 0xd6, 0x3e, 0x15, 0x69, 0x2, 0xc2, 0xc4, 0x77, 0x1d, + 0x51, 0x39, 0x67, 0x5a, 0xa6, 0x94, 0xaf, 0x14, 0x2c, 0x46, 0x26, + 0xde, 0xcb, 0x4b, 0xa7, 0xab, 0x6f, 0xec, 0x60, 0xf9, 0x22}, + {0x76, 0x50, 0xae, 0x93, 0xf6, 0x11, 0x81, 0x54, 0xa6, 0x54, 0xfd, + 0x1d, 0xdf, 0x21, 0xae, 0x1d, 0x65, 0x5e, 0x11, 0xf3, 0x90, 0x8c, + 0x24, 0x12, 0x94, 0xf4, 0xe7, 0x8d, 0x5f, 0xd1, 0x9f, 0x5d}, + }, + { + {0x1e, 0x52, 0xd7, 0xee, 0x2a, 0x4d, 0x24, 0x3f, 0x15, 0x96, 0x2e, + 0x43, 0x28, 0x90, 0x3a, 0x8e, 0xd4, 0x16, 0x9c, 0x2e, 0x77, 0xba, + 0x64, 0xe1, 0xd8, 0x98, 0xeb, 0x47, 0xfa, 0x87, 0xc1, 0x3b}, + {0x7f, 0x72, 0x63, 0x6d, 0xd3, 0x8, 0x14, 0x3, 0x33, 0xb5, 0xc7, + 0xd7, 0xef, 0x9a, 0x37, 0x6a, 0x4b, 0xe2, 0xae, 0xcc, 0xc5, 0x8f, + 0xe1, 0xa9, 0xd3, 0xbe, 0x8f, 0x4f, 0x91, 0x35, 0x2f, 0x33}, + {0xc, 0xc2, 0x86, 0xea, 0x15, 0x1, 0x47, 0x6d, 0x25, 0xd1, 0x46, + 0x6c, 0xcb, 0xb7, 0x8a, 0x99, 0x88, 0x1, 0x66, 0x3a, 0xb5, 0x32, + 0x78, 0xd7, 0x3, 0xba, 0x6f, 0x90, 0xce, 0x81, 0xd, 0x45}, + }, + }, + { + { + {0x3f, 0x74, 0xae, 0x1c, 0x96, 0xd8, 0x74, 0xd0, 0xed, 0x63, 0x1c, + 0xee, 0xf5, 0x18, 0x6d, 0xf8, 0x29, 0xed, 0xf4, 0xe7, 0x5b, 0xc5, + 0xbd, 0x97, 0x8, 0xb1, 0x3a, 0x66, 0x79, 0xd2, 0xba, 0x4c}, + {0x75, 0x52, 0x20, 0xa6, 0xa1, 0xb6, 0x7b, 0x6e, 0x83, 0x8e, 0x3c, + 
0x41, 0xd7, 0x21, 0x4f, 0xaa, 0xb2, 0x5c, 0x8f, 0xe8, 0x55, 0xd1, + 0x56, 0x6f, 0xe1, 0x5b, 0x34, 0xa6, 0x4b, 0x5d, 0xe2, 0x2d}, + {0xcd, 0x1f, 0xd7, 0xa0, 0x24, 0x90, 0xd1, 0x80, 0xf8, 0x8a, 0x28, + 0xfb, 0xa, 0xc2, 0x25, 0xc5, 0x19, 0x64, 0x3a, 0x5f, 0x4b, 0x97, + 0xa3, 0xb1, 0x33, 0x72, 0x0, 0xe2, 0xef, 0xbc, 0x7f, 0x7d}, + }, + { + {0x94, 0x90, 0xc2, 0xf3, 0xc5, 0x5d, 0x7c, 0xcd, 0xab, 0x5, 0x91, + 0x2a, 0x9a, 0xa2, 0x81, 0xc7, 0x58, 0x30, 0x1c, 0x42, 0x36, 0x1d, + 0xc6, 0x80, 0xd7, 0xd4, 0xd8, 0xdc, 0x96, 0xd1, 0x9c, 0x4f}, + {0x1, 0x28, 0x6b, 0x26, 0x6a, 0x1e, 0xef, 0xfa, 0x16, 0x9f, 0x73, + 0xd5, 0xc4, 0x68, 0x6c, 0x86, 0x2c, 0x76, 0x3, 0x1b, 0xbc, 0x2f, + 0x8a, 0xf6, 0x8d, 0x5a, 0xb7, 0x87, 0x5e, 0x43, 0x75, 0x59}, + {0x68, 0x37, 0x7b, 0x6a, 0xd8, 0x97, 0x92, 0x19, 0x63, 0x7a, 0xd1, + 0x1a, 0x24, 0x58, 0xd0, 0xd0, 0x17, 0xc, 0x1c, 0x5c, 0xad, 0x9c, + 0x2, 0xba, 0x7, 0x3, 0x7a, 0x38, 0x84, 0xd0, 0xcd, 0x7c}, + }, + { + {0x93, 0xcc, 0x60, 0x67, 0x18, 0x84, 0xc, 0x9b, 0x99, 0x2a, 0xb3, + 0x1a, 0x7a, 0x0, 0xae, 0xcd, 0x18, 0xda, 0xb, 0x62, 0x86, 0xec, + 0x8d, 0xa8, 0x44, 0xca, 0x90, 0x81, 0x84, 0xca, 0x93, 0x35}, + {0x17, 0x4, 0x26, 0x6d, 0x2c, 0x42, 0xa6, 0xdc, 0xbd, 0x40, 0x82, + 0x94, 0x50, 0x3d, 0x15, 0xae, 0x77, 0xc6, 0x68, 0xfb, 0xb4, 0xc1, + 0xc0, 0xa9, 0x53, 0xcf, 0xd0, 0x61, 0xed, 0xd0, 0x8b, 0x42}, + {0xa7, 0x9a, 0x84, 0x5e, 0x9a, 0x18, 0x13, 0x92, 0xcd, 0xfa, 0xd8, + 0x65, 0x35, 0xc3, 0xd8, 0xd4, 0xd1, 0xbb, 0xfd, 0x53, 0x5b, 0x54, + 0x52, 0x8c, 0xe6, 0x63, 0x2d, 0xda, 0x8, 0x83, 0x39, 0x27}, + }, + { + {0x53, 0x24, 0x70, 0xa, 0x4c, 0xe, 0xa1, 0xb9, 0xde, 0x1b, 0x7d, + 0xd5, 0x66, 0x58, 0xa2, 0xf, 0xf7, 0xda, 0x27, 0xcd, 0xb5, 0xd9, + 0xb9, 0xff, 0xfd, 0x33, 0x2c, 0x49, 0x45, 0x29, 0x2c, 0x57}, + {0x13, 0xd4, 0x5e, 0x43, 0x28, 0x8d, 0xc3, 0x42, 0xc9, 0xcc, 0x78, + 0x32, 0x60, 0xf3, 0x50, 0xbd, 0xef, 0x3, 0xda, 0x79, 0x1a, 0xab, + 0x7, 0xbb, 0x55, 0x33, 0x8c, 0xbe, 0xae, 0x97, 0x95, 0x26}, + {0xbe, 0x30, 0xcd, 0xd6, 0x45, 0xc7, 0x7f, 0xc7, 0xfb, 0xae, 0xba, + 0xe3, 0xd3, 0xe8, 0xdf, 0xe4, 0xc, 0xda, 0x5d, 0xaa, 0x30, 0x88, + 0x2c, 0xa2, 0x80, 0xca, 0x5b, 0xc0, 0x98, 0x54, 0x98, 0x7f}, + }, + { + {0x63, 0x63, 0xbf, 0xf, 0x52, 0x15, 0x56, 0xd3, 0xa6, 0xfb, 0x4d, + 0xcf, 0x45, 0x5a, 0x4, 0x8, 0xc2, 0xa0, 0x3f, 0x87, 0xbc, 0x4f, + 0xc2, 0xee, 0xe7, 0x12, 0x9b, 0xd6, 0x3c, 0x65, 0xf2, 0x30}, + {0x17, 0xe1, 0xb, 0x9f, 0x88, 0xce, 0x49, 0x38, 0x88, 0xa2, 0x54, + 0x7b, 0x1b, 0xad, 0x5, 0x80, 0x1c, 0x92, 0xfc, 0x23, 0x9f, 0xc3, + 0xa3, 0x3d, 0x4, 0xf3, 0x31, 0xa, 0x47, 0xec, 0xc2, 0x76}, + {0x85, 0xc, 0xc1, 0xaa, 0x38, 0xc9, 0x8, 0x8a, 0xcb, 0x6b, 0x27, + 0xdb, 0x60, 0x9b, 0x17, 0x46, 0x70, 0xac, 0x6f, 0xe, 0x1e, 0xc0, + 0x20, 0xa9, 0xda, 0x73, 0x64, 0x59, 0xf1, 0x73, 0x12, 0x2f}, + }, + { + {0xc0, 0xb, 0xa7, 0x55, 0xd7, 0x8b, 0x48, 0x30, 0xe7, 0x42, 0xd4, + 0xf1, 0xa4, 0xb5, 0xd6, 0x6, 0x62, 0x61, 0x59, 0xbc, 0x9e, 0xa6, + 0xd1, 0xea, 0x84, 0xf7, 0xc5, 0xed, 0x97, 0x19, 0xac, 0x38}, + {0x11, 0x1e, 0xe0, 0x8a, 0x7c, 0xfc, 0x39, 0x47, 0x9f, 0xab, 0x6a, + 0x4a, 0x90, 0x74, 0x52, 0xfd, 0x2e, 0x8f, 0x72, 0x87, 0x82, 0x8a, + 0xd9, 0x41, 0xf2, 0x69, 0x5b, 0xd8, 0x2a, 0x57, 0x9e, 0x5d}, + {0x3b, 0xb1, 0x51, 0xa7, 0x17, 0xb5, 0x66, 0x6, 0x8c, 0x85, 0x9b, + 0x7e, 0x86, 0x6, 0x7d, 0x74, 0x49, 0xde, 0x4d, 0x45, 0x11, 0xc0, + 0xac, 0xac, 0x9c, 0xe6, 0xe9, 0xbf, 0x9c, 0xcd, 0xdf, 0x22}, + }, + { + {0xa1, 0xe0, 0x3b, 0x10, 0xb4, 0x59, 0xec, 0x56, 0x69, 0xf9, 0x59, + 0xd2, 0xec, 0xba, 0xe3, 0x2e, 0x32, 0xcd, 0xf5, 0x13, 0x94, 0xb2, + 0x7c, 0x79, 0x72, 0xe4, 0xcd, 0x24, 0x78, 0x87, 0xe9, 0xf}, + 
{0xd9, 0xc, 0xd, 0xc3, 0xe0, 0xd2, 0xdb, 0x8d, 0x33, 0x43, 0xbb, + 0xac, 0x5f, 0x66, 0x8e, 0xad, 0x1f, 0x96, 0x2a, 0x32, 0x8c, 0x25, + 0x6b, 0x8f, 0xc7, 0xc1, 0x48, 0x54, 0xc0, 0x16, 0x29, 0x6b}, + {0x3b, 0x91, 0xba, 0xa, 0xd1, 0x34, 0xdb, 0x7e, 0xe, 0xac, 0x6d, + 0x2e, 0x82, 0xcd, 0xa3, 0x4e, 0x15, 0xf8, 0x78, 0x65, 0xff, 0x3d, + 0x8, 0x66, 0x17, 0xa, 0xf0, 0x7f, 0x30, 0x3f, 0x30, 0x4c}, + }, + { + {0x0, 0x45, 0xd9, 0xd, 0x58, 0x3, 0xfc, 0x29, 0x93, 0xec, 0xbb, + 0x6f, 0xa4, 0x7a, 0xd2, 0xec, 0xf8, 0xa7, 0xe2, 0xc2, 0x5f, 0x15, + 0xa, 0x13, 0xd5, 0xa1, 0x6, 0xb7, 0x1a, 0x15, 0x6b, 0x41}, + {0x85, 0x8c, 0xb2, 0x17, 0xd6, 0x3b, 0xa, 0xd3, 0xea, 0x3b, 0x77, + 0x39, 0xb7, 0x77, 0xd3, 0xc5, 0xbf, 0x5c, 0x6a, 0x1e, 0x8c, 0xe7, + 0xc6, 0xc6, 0xc4, 0xb7, 0x2a, 0x8b, 0xf7, 0xb8, 0x61, 0xd}, + {0xb0, 0x36, 0xc1, 0xe9, 0xef, 0xd7, 0xa8, 0x56, 0x20, 0x4b, 0xe4, + 0x58, 0xcd, 0xe5, 0x7, 0xbd, 0xab, 0xe0, 0x57, 0x1b, 0xda, 0x2f, + 0xe6, 0xaf, 0xd2, 0xe8, 0x77, 0x42, 0xf7, 0x2a, 0x1a, 0x19}, + }, + }, + { + { + {0xfb, 0xe, 0x46, 0x4f, 0x43, 0x2b, 0xe6, 0x9f, 0xd6, 0x7, 0x36, + 0xa6, 0xd4, 0x3, 0xd3, 0xde, 0x24, 0xda, 0xa0, 0xb7, 0xe, 0x21, + 0x52, 0xf0, 0x93, 0x5b, 0x54, 0x0, 0xbe, 0x7d, 0x7e, 0x23}, + {0x31, 0x14, 0x3c, 0xc5, 0x4b, 0xf7, 0x16, 0xce, 0xde, 0xed, 0x72, + 0x20, 0xce, 0x25, 0x97, 0x2b, 0xe7, 0x3e, 0xb2, 0xb5, 0x6f, 0xc3, + 0xb9, 0xb8, 0x8, 0xc9, 0x5c, 0xb, 0x45, 0xe, 0x2e, 0x7e}, + {0x30, 0xb4, 0x1, 0x67, 0xed, 0x75, 0x35, 0x1, 0x10, 0xfd, 0xb, + 0x9f, 0xe6, 0x94, 0x10, 0x23, 0x22, 0x7f, 0xe4, 0x83, 0x15, 0xf, + 0x32, 0x75, 0xe3, 0x55, 0x11, 0xb1, 0x99, 0xa6, 0xaf, 0x71}, + }, + { + {0xd6, 0x50, 0x3b, 0x47, 0x1c, 0x3c, 0x42, 0xea, 0x10, 0xef, 0x38, + 0x3b, 0x1f, 0x7a, 0xe8, 0x51, 0x95, 0xbe, 0xc9, 0xb2, 0x5f, 0xbf, + 0x84, 0x9b, 0x1c, 0x9a, 0xf8, 0x78, 0xbc, 0x1f, 0x73, 0x0}, + {0x1d, 0xb6, 0x53, 0x39, 0x9b, 0x6f, 0xce, 0x65, 0xe6, 0x41, 0xa1, + 0xaf, 0xea, 0x39, 0x58, 0xc6, 0xfe, 0x59, 0xf7, 0xa9, 0xfd, 0x5f, + 0x43, 0xf, 0x8e, 0xc2, 0xb1, 0xc2, 0xe9, 0x42, 0x11, 0x2}, + {0x80, 0x18, 0xf8, 0x48, 0x18, 0xc7, 0x30, 0xe4, 0x19, 0xc1, 0xce, + 0x5e, 0x22, 0xc, 0x96, 0xbf, 0xe3, 0x15, 0xba, 0x6b, 0x83, 0xe0, + 0xda, 0xb6, 0x8, 0x58, 0xe1, 0x47, 0x33, 0x6f, 0x4d, 0x4c}, + }, + { + {0x70, 0x19, 0x8f, 0x98, 0xfc, 0xdd, 0xc, 0x2f, 0x1b, 0xf5, 0xb9, + 0xb0, 0x27, 0x62, 0x91, 0x6b, 0xbe, 0x76, 0x91, 0x77, 0xc4, 0xb6, + 0xc7, 0x6e, 0xa8, 0x9f, 0x8f, 0xa8, 0x0, 0x95, 0xbf, 0x38}, + {0xc9, 0x1f, 0x7d, 0xc1, 0xcf, 0xec, 0xf7, 0x18, 0x14, 0x3c, 0x40, + 0x51, 0xa6, 0xf5, 0x75, 0x6c, 0xdf, 0xc, 0xee, 0xf7, 0x2b, 0x71, + 0xde, 0xdb, 0x22, 0x7a, 0xe4, 0xa7, 0xaa, 0xdd, 0x3f, 0x19}, + {0x6f, 0x87, 0xe8, 0x37, 0x3c, 0xc9, 0xd2, 0x1f, 0x2c, 0x46, 0xd1, + 0x18, 0x5a, 0x1e, 0xf6, 0xa2, 0x76, 0x12, 0x24, 0x39, 0x82, 0xf5, + 0x80, 0x50, 0x69, 0x49, 0xd, 0xbf, 0x9e, 0xb9, 0x6f, 0x6a}, + }, + { + {0xc6, 0x23, 0xe4, 0xb6, 0xb5, 0x22, 0xb1, 0xee, 0x8e, 0xff, 0x86, + 0xf2, 0x10, 0x70, 0x9d, 0x93, 0x8c, 0x5d, 0xcf, 0x1d, 0x83, 0x2a, + 0xa9, 0x90, 0x10, 0xeb, 0xc5, 0x42, 0x9f, 0xda, 0x6f, 0x13}, + {0xeb, 0x55, 0x8, 0x56, 0xbb, 0xc1, 0x46, 0x6a, 0x9d, 0xf0, 0x93, + 0xf8, 0x38, 0xbb, 0x16, 0x24, 0xc1, 0xac, 0x71, 0x8f, 0x37, 0x11, + 0x1d, 0xd7, 0xea, 0x96, 0x18, 0xa3, 0x14, 0x69, 0xf7, 0x75}, + {0xd1, 0xbd, 0x5, 0xa3, 0xb1, 0xdf, 0x4c, 0xf9, 0x8, 0x2c, 0xf8, + 0x9f, 0x9d, 0x4b, 0x36, 0xf, 0x8a, 0x58, 0xbb, 0xc3, 0xa5, 0xd8, + 0x87, 0x2a, 0xba, 0xdc, 0xe8, 0xb, 0x51, 0x83, 0x21, 0x2}, + }, + { + {0x7f, 0x7a, 0x30, 0x43, 0x1, 0x71, 0x5a, 0x9d, 0x5f, 0xa4, 0x7d, + 0xc4, 0x9e, 0xde, 0x63, 0xb0, 0xd3, 0x7a, 0x92, 0xbe, 0x52, 
0xfe, + 0xbb, 0x22, 0x6c, 0x42, 0x40, 0xfd, 0x41, 0xc4, 0x87, 0x13}, + {0x14, 0x2d, 0xad, 0x5e, 0x38, 0x66, 0xf7, 0x4a, 0x30, 0x58, 0x7c, + 0xca, 0x80, 0xd8, 0x8e, 0xa0, 0x3d, 0x1e, 0x21, 0x10, 0xe6, 0xa6, + 0x13, 0xd, 0x3, 0x6c, 0x80, 0x7b, 0xe1, 0x1c, 0x7, 0x6a}, + {0xf8, 0x8a, 0x97, 0x87, 0xd1, 0xc3, 0xd3, 0xb5, 0x13, 0x44, 0xe, + 0x7f, 0x3d, 0x5a, 0x2b, 0x72, 0xa0, 0x7c, 0x47, 0xbb, 0x48, 0x48, + 0x7b, 0xd, 0x92, 0xdc, 0x1e, 0xaf, 0x6a, 0xb2, 0x71, 0x31}, + }, + { + {0xd1, 0x47, 0x8a, 0xb2, 0xd8, 0xb7, 0xd, 0xa6, 0xf1, 0xa4, 0x70, + 0x17, 0xd6, 0x14, 0xbf, 0xa6, 0x58, 0xbd, 0xdd, 0x53, 0x93, 0xf8, + 0xa1, 0xd4, 0xe9, 0x43, 0x42, 0x34, 0x63, 0x4a, 0x51, 0x6c}, + {0xa8, 0x4c, 0x56, 0x97, 0x90, 0x31, 0x2f, 0xa9, 0x19, 0xe1, 0x75, + 0x22, 0x4c, 0xb8, 0x7b, 0xff, 0x50, 0x51, 0x87, 0xa4, 0x37, 0xfe, + 0x55, 0x4f, 0x5a, 0x83, 0xf0, 0x3c, 0x87, 0xd4, 0x1f, 0x22}, + {0x41, 0x63, 0x15, 0x3a, 0x4f, 0x20, 0x22, 0x23, 0x2d, 0x3, 0xa, + 0xba, 0xe9, 0xe0, 0x73, 0xfb, 0xe, 0x3, 0xf, 0x41, 0x4c, 0xdd, + 0xe0, 0xfc, 0xaa, 0x4a, 0x92, 0xfb, 0x96, 0xa5, 0xda, 0x48}, + }, + { + {0x93, 0x97, 0x4c, 0xc8, 0x5d, 0x1d, 0xf6, 0x14, 0x6, 0x82, 0x41, + 0xef, 0xe3, 0xf9, 0x41, 0x99, 0xac, 0x77, 0x62, 0x34, 0x8f, 0xb8, + 0xf5, 0xcd, 0xa9, 0x79, 0x8a, 0xe, 0xfa, 0x37, 0xc8, 0x58}, + {0xc7, 0x9c, 0xa5, 0x5c, 0x66, 0x8e, 0xca, 0x6e, 0xa0, 0xac, 0x38, + 0x2e, 0x4b, 0x25, 0x47, 0xa8, 0xce, 0x17, 0x1e, 0xd2, 0x8, 0xc7, + 0xaf, 0x31, 0xf7, 0x4a, 0xd8, 0xca, 0xfc, 0xd6, 0x6d, 0x67}, + {0x58, 0x90, 0xfc, 0x96, 0x85, 0x68, 0xf9, 0xc, 0x1b, 0xa0, 0x56, + 0x7b, 0xf3, 0xbb, 0xdc, 0x1d, 0x6a, 0xd6, 0x35, 0x49, 0x7d, 0xe7, + 0xc2, 0xdc, 0xa, 0x7f, 0xa5, 0xc6, 0xf2, 0x73, 0x4f, 0x1c}, + }, + { + {0x84, 0x34, 0x7c, 0xfc, 0x6e, 0x70, 0x6e, 0xb3, 0x61, 0xcf, 0xc1, + 0xc3, 0xb4, 0xc9, 0xdf, 0x73, 0xe5, 0xc7, 0x1c, 0x78, 0xc9, 0x79, + 0x1d, 0xeb, 0x5c, 0x67, 0xaf, 0x7d, 0xdb, 0x9a, 0x45, 0x70}, + {0xbb, 0xa0, 0x5f, 0x30, 0xbd, 0x4f, 0x7a, 0xe, 0xad, 0x63, 0xc6, + 0x54, 0xe0, 0x4c, 0x9d, 0x82, 0x48, 0x38, 0xe3, 0x2f, 0x83, 0xc3, + 0x21, 0xf4, 0x42, 0x4c, 0xf6, 0x1b, 0xd, 0xc8, 0x5a, 0x79}, + {0xb3, 0x2b, 0xb4, 0x91, 0x49, 0xdb, 0x91, 0x1b, 0xca, 0xdc, 0x2, + 0x4b, 0x23, 0x96, 0x26, 0x57, 0xdc, 0x78, 0x8c, 0x1f, 0xe5, 0x9e, + 0xdf, 0x9f, 0xd3, 0x1f, 0xe2, 0x8c, 0x84, 0x62, 0xe1, 0x5f}, + }, + }, + { + { + {0x8, 0xb2, 0x7c, 0x5d, 0x2d, 0x85, 0x79, 0x28, 0xe7, 0xf2, 0x7d, + 0x68, 0x70, 0xdd, 0xde, 0xb8, 0x91, 0x78, 0x68, 0x21, 0xab, 0xff, + 0xb, 0xdc, 0x35, 0xaa, 0x7d, 0x67, 0x43, 0xc0, 0x44, 0x2b}, + {0x1a, 0x96, 0x94, 0xe1, 0x4f, 0x21, 0x59, 0x4e, 0x4f, 0xcd, 0x71, + 0xd, 0xc7, 0x7d, 0xbe, 0x49, 0x2d, 0xf2, 0x50, 0x3b, 0xd2, 0xcf, + 0x0, 0x93, 0x32, 0x72, 0x91, 0xfc, 0x46, 0xd4, 0x89, 0x47}, + {0x8e, 0xb7, 0x4e, 0x7, 0xab, 0x87, 0x1c, 0x1a, 0x67, 0xf4, 0xda, + 0x99, 0x8e, 0xd1, 0xc6, 0xfa, 0x67, 0x90, 0x4f, 0x48, 0xcd, 0xbb, + 0xac, 0x3e, 0xe4, 0xa4, 0xb9, 0x2b, 0xef, 0x2e, 0xc5, 0x60}, + }, + { + {0x11, 0x6d, 0xae, 0x7c, 0xc2, 0xc5, 0x2b, 0x70, 0xab, 0x8c, 0xa4, + 0x54, 0x9b, 0x69, 0xc7, 0x44, 0xb2, 0x2e, 0x49, 0xba, 0x56, 0x40, + 0xbc, 0xef, 0x6d, 0x67, 0xb6, 0xd9, 0x48, 0x72, 0xd7, 0x70}, + {0xf1, 0x8b, 0xfd, 0x3b, 0xbc, 0x89, 0x5d, 0xb, 0x1a, 0x55, 0xf3, + 0xc9, 0x37, 0x92, 0x6b, 0xb0, 0xf5, 0x28, 0x30, 0xd5, 0xb0, 0x16, + 0x4c, 0xe, 0xab, 0xca, 0xcf, 0x2c, 0x31, 0x9c, 0xbc, 0x10}, + {0x5b, 0xa0, 0xc2, 0x3e, 0x4b, 0xe8, 0x8a, 0xaa, 0xe0, 0x81, 0x17, + 0xed, 0xf4, 0x9e, 0x69, 0x98, 0xd1, 0x85, 0x8e, 0x70, 0xe4, 0x13, + 0x45, 0x79, 0x13, 0xf4, 0x76, 0xa9, 0xd3, 0x5b, 0x75, 0x63}, + }, + { + {0xb7, 0xac, 0xf1, 0x97, 0x18, 0x10, 0xc7, 
0x3d, 0xd8, 0xbb, 0x65, + 0xc1, 0x5e, 0x7d, 0xda, 0x5d, 0xf, 0x2, 0xa1, 0xf, 0x9c, 0x5b, + 0x8e, 0x50, 0x56, 0x2a, 0xc5, 0x37, 0x17, 0x75, 0x63, 0x27}, + {0x53, 0x8, 0xd1, 0x2a, 0x3e, 0xa0, 0x5f, 0xb5, 0x69, 0x35, 0xe6, + 0x9e, 0x90, 0x75, 0x6f, 0x35, 0x90, 0xb8, 0x69, 0xbe, 0xfd, 0xf1, + 0xf9, 0x9f, 0x84, 0x6f, 0xc1, 0x8b, 0xc4, 0xc1, 0x8c, 0xd}, + {0xa9, 0x19, 0xb4, 0x6e, 0xd3, 0x2, 0x94, 0x2, 0xa5, 0x60, 0xb4, + 0x77, 0x7e, 0x4e, 0xb4, 0xf0, 0x56, 0x49, 0x3c, 0xd4, 0x30, 0x62, + 0xa8, 0xcf, 0xe7, 0x66, 0xd1, 0x7a, 0x8a, 0xdd, 0xc2, 0x70}, + }, + { + {0x13, 0x7e, 0xed, 0xb8, 0x7d, 0x96, 0xd4, 0x91, 0x7a, 0x81, 0x76, + 0xd7, 0xa, 0x2f, 0x25, 0x74, 0x64, 0x25, 0x85, 0xd, 0xe0, 0x82, + 0x9, 0xe4, 0xe5, 0x3c, 0xa5, 0x16, 0x38, 0x61, 0xb8, 0x32}, + {0xe, 0xec, 0x6f, 0x9f, 0x50, 0x94, 0x61, 0x65, 0x8d, 0x51, 0xc6, + 0x46, 0xa9, 0x7e, 0x2e, 0xee, 0x5c, 0x9b, 0xe0, 0x67, 0xf3, 0xc1, + 0x33, 0x97, 0x95, 0x84, 0x94, 0x63, 0x63, 0xac, 0xf, 0x2e}, + {0x64, 0xcd, 0x48, 0xe4, 0xbe, 0xf7, 0xe7, 0x79, 0xd0, 0x86, 0x78, + 0x8, 0x67, 0x3a, 0xc8, 0x6a, 0x2e, 0xdb, 0xe4, 0xa0, 0xd9, 0xd4, + 0x9f, 0xf8, 0x41, 0x4f, 0x5a, 0x73, 0x5c, 0x21, 0x79, 0x41}, + }, + { + {0x34, 0xcd, 0x6b, 0x28, 0xb9, 0x33, 0xae, 0xe4, 0xdc, 0xd6, 0x9d, + 0x55, 0xb6, 0x7e, 0xef, 0xb7, 0x1f, 0x8e, 0xd3, 0xb3, 0x1f, 0x14, + 0x8b, 0x27, 0x86, 0xc2, 0x41, 0x22, 0x66, 0x85, 0xfa, 0x31}, + {0x2a, 0xed, 0xdc, 0xd7, 0xe7, 0x94, 0x70, 0x8c, 0x70, 0x9c, 0xd3, + 0x47, 0xc3, 0x8a, 0xfb, 0x97, 0x2, 0xd9, 0x6, 0xa9, 0x33, 0xe0, + 0x3b, 0xe1, 0x76, 0x9d, 0xd9, 0xc, 0xa3, 0x44, 0x3, 0x70}, + {0xf4, 0x22, 0x36, 0x2e, 0x42, 0x6c, 0x82, 0xaf, 0x2d, 0x50, 0x33, + 0x98, 0x87, 0x29, 0x20, 0xc1, 0x23, 0x91, 0x38, 0x2b, 0xe1, 0xb7, + 0xc1, 0x9b, 0x89, 0x24, 0x95, 0xa9, 0x12, 0x23, 0xbb, 0x24}, + }, + { + {0x6b, 0x5c, 0xf8, 0xf5, 0x2a, 0xc, 0xf8, 0x41, 0x94, 0x67, 0xfa, + 0x4, 0xc3, 0x84, 0x72, 0x68, 0xad, 0x1b, 0xba, 0xa3, 0x99, 0xdf, + 0x45, 0x89, 0x16, 0x5d, 0xeb, 0xff, 0xf9, 0x2a, 0x1d, 0xd}, + {0xc3, 0x67, 0xde, 0x32, 0x17, 0xed, 0xa8, 0xb1, 0x48, 0x49, 0x1b, + 0x46, 0x18, 0x94, 0xb4, 0x3c, 0xd2, 0xbc, 0xcf, 0x76, 0x43, 0x43, + 0xbd, 0x8e, 0x8, 0x80, 0x18, 0x1e, 0x87, 0x3e, 0xee, 0xf}, + {0xdf, 0x1e, 0x62, 0x32, 0xa1, 0x8a, 0xda, 0xa9, 0x79, 0x65, 0x22, + 0x59, 0xa1, 0x22, 0xb8, 0x30, 0x93, 0xc1, 0x9a, 0xa7, 0x7b, 0x19, + 0x4, 0x40, 0x76, 0x1d, 0x53, 0x18, 0x97, 0xd7, 0xac, 0x16}, + }, + { + {0xad, 0xb6, 0x87, 0x78, 0xc5, 0xc6, 0x59, 0xc9, 0xba, 0xfe, 0x90, + 0x5f, 0xad, 0x9e, 0xe1, 0x94, 0x4, 0xf5, 0x42, 0xa3, 0x62, 0x4e, + 0xe2, 0x16, 0x0, 0x17, 0x16, 0x18, 0x4b, 0xd3, 0x4e, 0x16}, + {0x3d, 0x1d, 0x9b, 0x2d, 0xaf, 0x72, 0xdf, 0x72, 0x5a, 0x24, 0x32, + 0xa4, 0x36, 0x2a, 0x46, 0x63, 0x37, 0x96, 0xb3, 0x16, 0x79, 0xa0, + 0xce, 0x3e, 0x9, 0x23, 0x30, 0xb9, 0xf6, 0xe, 0x3e, 0x12}, + {0x9a, 0xe6, 0x2f, 0x19, 0x4c, 0xd9, 0x7e, 0x48, 0x13, 0x15, 0x91, + 0x3a, 0xea, 0x2c, 0xae, 0x61, 0x27, 0xde, 0xa4, 0xb9, 0xd3, 0xf6, + 0x7b, 0x87, 0xeb, 0xf3, 0x73, 0x10, 0xc6, 0xf, 0xda, 0x78}, + }, + { + {0x94, 0x3a, 0xc, 0x68, 0xf1, 0x80, 0x9f, 0xa2, 0xe6, 0xe7, 0xe9, + 0x1a, 0x15, 0x7e, 0xf7, 0x71, 0x73, 0x79, 0x1, 0x48, 0x58, 0xf1, + 0x0, 0x11, 0xdd, 0x8d, 0xb3, 0x16, 0xb3, 0xa4, 0x4a, 0x5}, + {0x6a, 0xc6, 0x2b, 0xe5, 0x28, 0x5d, 0xf1, 0x5b, 0x8e, 0x1a, 0xf0, + 0x70, 0x18, 0xe3, 0x47, 0x2c, 0xdd, 0x8b, 0xc2, 0x6, 0xbc, 0xaf, + 0x19, 0x24, 0x3a, 0x17, 0x6b, 0x25, 0xeb, 0xde, 0x25, 0x2d}, + {0xb8, 0x7c, 0x26, 0x19, 0x8d, 0x46, 0xc8, 0xdf, 0xaf, 0x4d, 0xe5, + 0x66, 0x9c, 0x78, 0x28, 0xb, 0x17, 0xec, 0x6e, 0x66, 0x2a, 0x1d, + 0xeb, 0x2a, 0x60, 0xa7, 0x7d, 0xab, 0xa6, 0x10, 
0x46, 0x13}, + }, + }, + { + { + {0x15, 0xf5, 0xd1, 0x77, 0xe7, 0x65, 0x2a, 0xcd, 0xf1, 0x60, 0xaa, + 0x8f, 0x87, 0x91, 0x89, 0x54, 0xe5, 0x6, 0xbc, 0xda, 0xbc, 0x3b, + 0xb7, 0xb1, 0xfb, 0xc9, 0x7c, 0xa9, 0xcb, 0x78, 0x48, 0x65}, + {0xfe, 0xb0, 0xf6, 0x8d, 0xc7, 0x8e, 0x13, 0x51, 0x1b, 0xf5, 0x75, + 0xe5, 0x89, 0xda, 0x97, 0x53, 0xb9, 0xf1, 0x7a, 0x71, 0x1d, 0x7a, + 0x20, 0x9, 0x50, 0xd6, 0x20, 0x2b, 0xba, 0xfd, 0x2, 0x21}, + {0xa1, 0xe6, 0x5c, 0x5, 0x5, 0xe4, 0x9e, 0x96, 0x29, 0xad, 0x51, + 0x12, 0x68, 0xa7, 0xbc, 0x36, 0x15, 0xa4, 0x7d, 0xaa, 0x17, 0xf5, + 0x1a, 0x3a, 0xba, 0xb2, 0xec, 0x29, 0xdb, 0x25, 0xd7, 0xa}, + }, + { + {0x85, 0x6f, 0x5, 0x9b, 0xc, 0xbc, 0xc7, 0xfe, 0xd7, 0xff, 0xf5, + 0xe7, 0x68, 0x52, 0x7d, 0x53, 0xfa, 0xae, 0x12, 0x43, 0x62, 0xc6, + 0xaf, 0x77, 0xd9, 0x9f, 0x39, 0x2, 0x53, 0x5f, 0x67, 0x4f}, + {0x57, 0x24, 0x4e, 0x83, 0xb1, 0x67, 0x42, 0xdc, 0xc5, 0x1b, 0xce, + 0x70, 0xb5, 0x44, 0x75, 0xb6, 0xd7, 0x5e, 0xd1, 0xf7, 0xb, 0x7a, + 0xf0, 0x1a, 0x50, 0x36, 0xa0, 0x71, 0xfb, 0xcf, 0xef, 0x4a}, + {0x1e, 0x17, 0x15, 0x4, 0x36, 0x36, 0x2d, 0xc3, 0x3b, 0x48, 0x98, + 0x89, 0x11, 0xef, 0x2b, 0xcd, 0x10, 0x51, 0x94, 0xd0, 0xad, 0x6e, + 0xa, 0x87, 0x61, 0x65, 0xa8, 0xa2, 0x72, 0xbb, 0xcc, 0xb}, + }, + { + {0x96, 0x12, 0xfe, 0x50, 0x4c, 0x5e, 0x6d, 0x18, 0x7e, 0x9f, 0xe8, + 0xfe, 0x82, 0x7b, 0x39, 0xe0, 0xb0, 0x31, 0x70, 0x50, 0xc5, 0xf6, + 0xc7, 0x3b, 0xc2, 0x37, 0x8f, 0x10, 0x69, 0xfd, 0x78, 0x66}, + {0xc8, 0xa9, 0xb1, 0xea, 0x2f, 0x96, 0x5e, 0x18, 0xcd, 0x7d, 0x14, + 0x65, 0x35, 0xe6, 0xe7, 0x86, 0xf2, 0x6d, 0x5b, 0xbb, 0x31, 0xe0, + 0x92, 0xb0, 0x3e, 0xb7, 0xd6, 0x59, 0xab, 0xf0, 0x24, 0x40}, + {0xc2, 0x63, 0x68, 0x63, 0x31, 0xfa, 0x86, 0x15, 0xf2, 0x33, 0x2d, + 0x57, 0x48, 0x8c, 0xf6, 0x7, 0xfc, 0xae, 0x9e, 0x78, 0x9f, 0xcc, + 0x73, 0x4f, 0x1, 0x47, 0xad, 0x8e, 0x10, 0xe2, 0x42, 0x2d}, + }, + { + {0x93, 0x75, 0x53, 0xf, 0xd, 0x7b, 0x71, 0x21, 0x4c, 0x6, 0x1e, + 0x13, 0xb, 0x69, 0x4e, 0x91, 0x9f, 0xe0, 0x2a, 0x75, 0xae, 0x87, + 0xb6, 0x1b, 0x6e, 0x3c, 0x42, 0x9b, 0xa7, 0xf3, 0xb, 0x42}, + {0x9b, 0xd2, 0xdf, 0x94, 0x15, 0x13, 0xf5, 0x97, 0x6a, 0x4c, 0x3f, + 0x31, 0x5d, 0x98, 0x55, 0x61, 0x10, 0x50, 0x45, 0x8, 0x7, 0x3f, + 0xa1, 0xeb, 0x22, 0xd3, 0xd2, 0xb8, 0x8, 0x26, 0x6b, 0x67}, + {0x47, 0x2b, 0x5b, 0x1c, 0x65, 0xba, 0x38, 0x81, 0x80, 0x1b, 0x1b, + 0x31, 0xec, 0xb6, 0x71, 0x86, 0xb0, 0x35, 0x31, 0xbc, 0xb1, 0xc, + 0xff, 0x7b, 0xe0, 0xf1, 0xc, 0x9c, 0xfa, 0x2f, 0x5d, 0x74}, + }, + { + {0x6a, 0x4e, 0xd3, 0x21, 0x57, 0xdf, 0x36, 0x60, 0xd0, 0xb3, 0x7b, + 0x99, 0x27, 0x88, 0xdb, 0xb1, 0xfa, 0x6a, 0x75, 0xc8, 0xc3, 0x9, + 0xc2, 0xd3, 0x39, 0xc8, 0x1d, 0x4c, 0xe5, 0x5b, 0xe1, 0x6}, + {0xbd, 0xc8, 0xc9, 0x2b, 0x1e, 0x5a, 0x52, 0xbf, 0x81, 0x9d, 0x47, + 0x26, 0x8, 0x26, 0x5b, 0xea, 0xdb, 0x55, 0x1, 0xdf, 0xe, 0xc7, + 0x11, 0xd5, 0xd0, 0xf5, 0xc, 0x96, 0xeb, 0x3c, 0xe2, 0x1a}, + {0x4a, 0x99, 0x32, 0x19, 0x87, 0x5d, 0x72, 0x5b, 0xb0, 0xda, 0xb1, + 0xce, 0xb5, 0x1c, 0x35, 0x32, 0x5, 0xca, 0xb7, 0xda, 0x49, 0x15, + 0xc4, 0x7d, 0xf7, 0xc1, 0x8e, 0x27, 0x61, 0xd8, 0xde, 0x58}, + }, + { + {0xa8, 0xc9, 0xc2, 0xb6, 0xa8, 0x5b, 0xfb, 0x2d, 0x8c, 0x59, 0x2c, + 0xf5, 0x8e, 0xef, 0xee, 0x48, 0x73, 0x15, 0x2d, 0xf1, 0x7, 0x91, + 0x80, 0x33, 0xd8, 0x5b, 0x1d, 0x53, 0x6b, 0x69, 0xba, 0x8}, + {0x5c, 0xc5, 0x66, 0xf2, 0x93, 0x37, 0x17, 0xd8, 0x49, 0x4e, 0x45, + 0xcc, 0xc5, 0x76, 0xc9, 0xc8, 0xa8, 0xc3, 0x26, 0xbc, 0xf8, 0x82, + 0xe3, 0x5c, 0xf9, 0xf6, 0x85, 0x54, 0xe8, 0x9d, 0xf3, 0x2f}, + {0x7a, 0xc5, 0xef, 0xc3, 0xee, 0x3e, 0xed, 0x77, 0x11, 0x48, 0xff, + 0xd4, 0x17, 0x55, 0xe0, 0x4, 0xcb, 0x71, 
0xa6, 0xf1, 0x3f, 0x7a, + 0x3d, 0xea, 0x54, 0xfe, 0x7c, 0x94, 0xb4, 0x33, 0x6, 0x12}, + }, + { + {0xa, 0x10, 0x12, 0x49, 0x47, 0x31, 0xbd, 0x82, 0x6, 0xbe, 0x6f, + 0x7e, 0x6d, 0x7b, 0x23, 0xde, 0xc6, 0x79, 0xea, 0x11, 0x19, 0x76, + 0x1e, 0xe1, 0xde, 0x3b, 0x39, 0xcb, 0xe3, 0x3b, 0x43, 0x7}, + {0x42, 0x0, 0x61, 0x91, 0x78, 0x98, 0x94, 0xb, 0xe8, 0xfa, 0xeb, + 0xec, 0x3c, 0xb1, 0xe7, 0x4e, 0xc0, 0xa4, 0xf0, 0x94, 0x95, 0x73, + 0xbe, 0x70, 0x85, 0x91, 0xd5, 0xb4, 0x99, 0xa, 0xd3, 0x35}, + {0xf4, 0x97, 0xe9, 0x5c, 0xc0, 0x44, 0x79, 0xff, 0xa3, 0x51, 0x5c, + 0xb0, 0xe4, 0x3d, 0x5d, 0x57, 0x7c, 0x84, 0x76, 0x5a, 0xfd, 0x81, + 0x33, 0x58, 0x9f, 0xda, 0xf6, 0x7a, 0xde, 0x3e, 0x87, 0x2d}, + }, + { + {0x81, 0xf9, 0x5d, 0x4e, 0xe1, 0x2, 0x62, 0xaa, 0xf5, 0xe1, 0x15, + 0x50, 0x17, 0x59, 0xd, 0xa2, 0x6c, 0x1d, 0xe2, 0xba, 0xd3, 0x75, + 0xa2, 0x18, 0x53, 0x2, 0x60, 0x1, 0x8a, 0x61, 0x43, 0x5}, + {0x9, 0x34, 0x37, 0x43, 0x64, 0x31, 0x7a, 0x15, 0xd9, 0x81, 0xaa, + 0xf4, 0xee, 0xb7, 0xb8, 0xfa, 0x6, 0x48, 0xa6, 0xf5, 0xe6, 0xfe, + 0x93, 0xb0, 0xb6, 0xa7, 0x7f, 0x70, 0x54, 0x36, 0x77, 0x2e}, + {0xc1, 0x23, 0x4c, 0x97, 0xf4, 0xbd, 0xea, 0xd, 0x93, 0x46, 0xce, + 0x9d, 0x25, 0xa, 0x6f, 0xaa, 0x2c, 0xba, 0x9a, 0xa2, 0xb8, 0x2c, + 0x20, 0x4, 0xd, 0x96, 0x7, 0x2d, 0x36, 0x43, 0x14, 0x4b}, + }, + }, + { + { + {0xcb, 0x9c, 0x52, 0x1c, 0xe9, 0x54, 0x7c, 0x96, 0xfb, 0x35, 0xc6, + 0x64, 0x92, 0x26, 0xf6, 0x30, 0x65, 0x19, 0x12, 0x78, 0xf4, 0xaf, + 0x47, 0x27, 0x5c, 0x6f, 0xf6, 0xea, 0x18, 0x84, 0x3, 0x17}, + {0x7a, 0x1f, 0x6e, 0xb6, 0xc7, 0xb7, 0xc4, 0xcc, 0x7e, 0x2f, 0xc, + 0xf5, 0x25, 0x7e, 0x15, 0x44, 0x1c, 0xaf, 0x3e, 0x71, 0xfc, 0x6d, + 0xf0, 0x3e, 0xf7, 0x63, 0xda, 0x52, 0x67, 0x44, 0x2f, 0x58}, + {0xe4, 0x4c, 0x32, 0x20, 0xd3, 0x7b, 0x31, 0xc6, 0xc4, 0x8b, 0x48, + 0xa4, 0xe8, 0x42, 0x10, 0xa8, 0x64, 0x13, 0x5a, 0x4e, 0x8b, 0xf1, + 0x1e, 0xb2, 0xc9, 0x8d, 0xa2, 0xcd, 0x4b, 0x1c, 0x2a, 0xc}, + }, + { + {0x45, 0x69, 0xbd, 0x69, 0x48, 0x81, 0xc4, 0xed, 0x22, 0x8d, 0x1c, + 0xbe, 0x7d, 0x90, 0x6d, 0xd, 0xab, 0xc5, 0x5c, 0xd5, 0x12, 0xd2, + 0x3b, 0xc6, 0x83, 0xdc, 0x14, 0xa3, 0x30, 0x9b, 0x6a, 0x5a}, + {0x47, 0x4, 0x1f, 0x6f, 0xd0, 0xc7, 0x4d, 0xd2, 0x59, 0xc0, 0x87, + 0xdb, 0x3e, 0x9e, 0x26, 0xb2, 0x8f, 0xd2, 0xb2, 0xfb, 0x72, 0x2, + 0x5b, 0xd1, 0x77, 0x48, 0xf6, 0xc6, 0xd1, 0x8b, 0x55, 0x7c}, + {0x3d, 0x46, 0x96, 0xd3, 0x24, 0x15, 0xec, 0xd0, 0xf0, 0x24, 0x5a, + 0xc3, 0x8a, 0x62, 0xbb, 0x12, 0xa4, 0x5f, 0xbc, 0x1c, 0x79, 0x3a, + 0xc, 0xa5, 0xc3, 0xaf, 0xfb, 0xa, 0xca, 0xa5, 0x4, 0x4}, + }, + { + {0xd1, 0x6f, 0x41, 0x2a, 0x1b, 0x9e, 0xbc, 0x62, 0x8b, 0x59, 0x50, + 0xe3, 0x28, 0xf7, 0xc6, 0xb5, 0x67, 0x69, 0x5d, 0x3d, 0xd8, 0x3f, + 0x34, 0x4, 0x98, 0xee, 0xf8, 0xe7, 0x16, 0x75, 0x52, 0x39}, + {0xd6, 0x43, 0xa7, 0xa, 0x7, 0x40, 0x1f, 0x8c, 0xe8, 0x5e, 0x26, + 0x5b, 0xcb, 0xd0, 0xba, 0xcc, 0xde, 0xd2, 0x8f, 0x66, 0x6b, 0x4, + 0x4b, 0x57, 0x33, 0x96, 0xdd, 0xca, 0xfd, 0x5b, 0x39, 0x46}, + {0x9c, 0x9a, 0x5d, 0x1a, 0x2d, 0xdb, 0x7f, 0x11, 0x2a, 0x5c, 0x0, + 0xd1, 0xbc, 0x45, 0x77, 0x9c, 0xea, 0x6f, 0xd5, 0x54, 0xf1, 0xbe, + 0xd4, 0xef, 0x16, 0xd0, 0x22, 0xe8, 0x29, 0x9a, 0x57, 0x76}, + }, + { + {0xf2, 0x34, 0xb4, 0x52, 0x13, 0xb5, 0x3c, 0x33, 0xe1, 0x80, 0xde, + 0x93, 0x49, 0x28, 0x32, 0xd8, 0xce, 0x35, 0xd, 0x75, 0x87, 0x28, + 0x51, 0xb5, 0xc1, 0x77, 0x27, 0x2a, 0xbb, 0x14, 0xc5, 0x2}, + {0x17, 0x2a, 0xc0, 0x49, 0x7e, 0x8e, 0xb6, 0x45, 0x7f, 0xa3, 0xa9, + 0xbc, 0xa2, 0x51, 0xcd, 0x23, 0x1b, 0x4c, 0x22, 0xec, 0x11, 0x5f, + 0xd6, 0x3e, 0xb1, 0xbd, 0x5, 0x9e, 0xdc, 0x84, 0xa3, 0x43}, + {0x45, 0xb6, 0xf1, 0x8b, 0xda, 0xd5, 
0x4b, 0x68, 0x53, 0x4b, 0xb5, + 0xf6, 0x7e, 0xd3, 0x8b, 0xfb, 0x53, 0xd2, 0xb0, 0xa9, 0xd7, 0x16, + 0x39, 0x31, 0x59, 0x80, 0x54, 0x61, 0x9, 0x92, 0x60, 0x11}, + }, + { + {0xcd, 0x4d, 0x9b, 0x36, 0x16, 0x56, 0x38, 0x7a, 0x63, 0x35, 0x5c, + 0x65, 0xa7, 0x2c, 0xc0, 0x75, 0x21, 0x80, 0xf1, 0xd4, 0xf9, 0x1b, + 0xc2, 0x7d, 0x42, 0xe0, 0xe6, 0x91, 0x74, 0x7d, 0x63, 0x2f}, + {0xaa, 0xcf, 0xda, 0x29, 0x69, 0x16, 0x4d, 0xb4, 0x8f, 0x59, 0x13, + 0x84, 0x4c, 0x9f, 0x52, 0xda, 0x59, 0x55, 0x3d, 0x45, 0xca, 0x63, + 0xef, 0xe9, 0xb, 0x8e, 0x69, 0xc5, 0x5b, 0x12, 0x1e, 0x35}, + {0xbe, 0x7b, 0xf6, 0x1a, 0x46, 0x9b, 0xb4, 0xd4, 0x61, 0x89, 0xab, + 0xc8, 0x7a, 0x3, 0x3, 0xd6, 0xfb, 0x99, 0xa6, 0xf9, 0x9f, 0xe1, + 0xde, 0x71, 0x9a, 0x2a, 0xce, 0xe7, 0x6, 0x2d, 0x18, 0x7f}, + }, + { + {0x22, 0x75, 0x21, 0x8e, 0x72, 0x4b, 0x45, 0x9, 0xd8, 0xb8, 0x84, + 0xd4, 0xf4, 0xe8, 0x58, 0xaa, 0x3c, 0x90, 0x46, 0x7f, 0x4d, 0x25, + 0x58, 0xd3, 0x17, 0x52, 0x1c, 0x24, 0x43, 0xc0, 0xac, 0x44}, + {0xec, 0x68, 0x1, 0xab, 0x64, 0x8e, 0x7c, 0x7a, 0x43, 0xc5, 0xed, + 0x15, 0x55, 0x4a, 0x5a, 0xcb, 0xda, 0xe, 0xcd, 0x47, 0xd3, 0x19, + 0x55, 0x9, 0xb0, 0x93, 0x3e, 0x34, 0x8c, 0xac, 0xd4, 0x67}, + {0x77, 0x57, 0x7a, 0x4f, 0xbb, 0x6b, 0x7d, 0x1c, 0xe1, 0x13, 0x83, + 0x91, 0xd4, 0xfe, 0x35, 0x8b, 0x84, 0x46, 0x6b, 0xc9, 0xc6, 0xa1, + 0xdc, 0x4a, 0xbd, 0x71, 0xad, 0x12, 0x83, 0x1c, 0x6d, 0x55}, + }, + { + {0x21, 0xe8, 0x1b, 0xb1, 0x56, 0x67, 0xf0, 0x81, 0xdd, 0xf3, 0xa3, + 0x10, 0x23, 0xf8, 0xaf, 0xf, 0x5d, 0x46, 0x99, 0x6a, 0x55, 0xd0, + 0xb2, 0xf8, 0x5, 0x7f, 0x8c, 0xcc, 0x38, 0xbe, 0x7a, 0x9}, + {0x82, 0x39, 0x8d, 0xc, 0xe3, 0x40, 0xef, 0x17, 0x34, 0xfa, 0xa3, + 0x15, 0x3e, 0x7, 0xf7, 0x31, 0x6e, 0x64, 0x73, 0x7, 0xcb, 0xf3, + 0x21, 0x4f, 0xff, 0x4e, 0x82, 0x1d, 0x6d, 0x6c, 0x6c, 0x74}, + {0xa4, 0x2d, 0xa5, 0x7e, 0x87, 0xc9, 0x49, 0xc, 0x43, 0x1d, 0xdc, + 0x9b, 0x55, 0x69, 0x43, 0x4c, 0xd2, 0xeb, 0xcc, 0xf7, 0x9, 0x38, + 0x2c, 0x2, 0xbd, 0x84, 0xee, 0x4b, 0xa3, 0x14, 0x7e, 0x57}, + }, + { + {0x2b, 0xd7, 0x4d, 0xbd, 0xbe, 0xce, 0xfe, 0x94, 0x11, 0x22, 0xf, + 0x6, 0xda, 0x4f, 0x6a, 0xf4, 0xff, 0xd1, 0xc8, 0xc0, 0x77, 0x59, + 0x4a, 0x12, 0x95, 0x92, 0x0, 0xfb, 0xb8, 0x4, 0x53, 0x70}, + {0xa, 0x3b, 0xa7, 0x61, 0xac, 0x68, 0xe2, 0xf0, 0xf5, 0xa5, 0x91, + 0x37, 0x10, 0xfa, 0xfa, 0xf2, 0xe9, 0x0, 0x6d, 0x6b, 0x82, 0x3e, + 0xe1, 0xc1, 0x42, 0x8f, 0xd7, 0x6f, 0xe9, 0x7e, 0xfa, 0x60}, + {0xc6, 0x6e, 0x29, 0x4d, 0x35, 0x1d, 0x3d, 0xb6, 0xd8, 0x31, 0xad, + 0x5f, 0x3e, 0x5, 0xc3, 0xf3, 0xec, 0x42, 0xbd, 0xb4, 0x8c, 0x95, + 0xb, 0x67, 0xfd, 0x53, 0x63, 0xa1, 0xc, 0x8e, 0x39, 0x21}, + }, + }, + { + { + {0x1, 0x56, 0xb7, 0xb4, 0xf9, 0xaa, 0x98, 0x27, 0x72, 0xad, 0x8d, + 0x5c, 0x13, 0x72, 0xac, 0x5e, 0x23, 0xa0, 0xb7, 0x61, 0x61, 0xaa, + 0xce, 0xd2, 0x4e, 0x7d, 0x8f, 0xe9, 0x84, 0xb2, 0xbf, 0x1b}, + {0xf3, 0x33, 0x2b, 0x38, 0x8a, 0x5, 0xf5, 0x89, 0xb4, 0xc0, 0x48, + 0xad, 0xb, 0xba, 0xe2, 0x5a, 0x6e, 0xb3, 0x3d, 0xa5, 0x3, 0xb5, + 0x93, 0x8f, 0xe6, 0x32, 0xa2, 0x95, 0x9d, 0xed, 0xa3, 0x5a}, + {0x61, 0x65, 0xd9, 0xc7, 0xe9, 0x77, 0x67, 0x65, 0x36, 0x80, 0xc7, + 0x72, 0x54, 0x12, 0x2b, 0xcb, 0xee, 0x6e, 0x50, 0xd9, 0x99, 0x32, + 0x5, 0x65, 0xcc, 0x57, 0x89, 0x5e, 0x4e, 0xe1, 0x7, 0x4a}, + }, + { + {0x9b, 0xa4, 0x77, 0xc4, 0xcd, 0x58, 0xb, 0x24, 0x17, 0xf0, 0x47, + 0x64, 0xde, 0xda, 0x38, 0xfd, 0xad, 0x6a, 0xc8, 0xa7, 0x32, 0x8d, + 0x92, 0x19, 0x81, 0xa0, 0xaf, 0x84, 0xed, 0x7a, 0xaf, 0x50}, + {0x99, 0xf9, 0xd, 0x98, 0xcb, 0x12, 0xe4, 0x4e, 0x71, 0xc7, 0x6e, + 0x3c, 0x6f, 0xd7, 0x15, 0xa3, 0xfd, 0x77, 0x5c, 0x92, 0xde, 0xed, + 0xa5, 0xbb, 0x2, 0x34, 
0x31, 0x1d, 0x39, 0xac, 0xb, 0x3f}, + {0xe5, 0x5b, 0xf6, 0x15, 0x1, 0xde, 0x4f, 0x6e, 0xb2, 0x9, 0x61, + 0x21, 0x21, 0x26, 0x98, 0x29, 0xd9, 0xd6, 0xad, 0xb, 0x81, 0x5, + 0x2, 0x78, 0x6, 0xd0, 0xeb, 0xba, 0x16, 0xa3, 0x21, 0x19}, + }, + { + {0x8b, 0xc1, 0xf3, 0xd9, 0x9a, 0xad, 0x5a, 0xd7, 0x9c, 0xc1, 0xb1, + 0x60, 0xef, 0xe, 0x6a, 0x56, 0xd9, 0xe, 0x5c, 0x25, 0xac, 0xb, + 0x9a, 0x3e, 0xf5, 0xc7, 0x62, 0xa0, 0xec, 0x9d, 0x4, 0x7b}, + {0xfc, 0x70, 0xb8, 0xdf, 0x7e, 0x2f, 0x42, 0x89, 0xbd, 0xb3, 0x76, + 0x4f, 0xeb, 0x6b, 0x29, 0x2c, 0xf7, 0x4d, 0xc2, 0x36, 0xd4, 0xf1, + 0x38, 0x7, 0xb0, 0xae, 0x73, 0xe2, 0x41, 0xdf, 0x58, 0x64}, + {0x83, 0x44, 0x44, 0x35, 0x7a, 0xe3, 0xcb, 0xdc, 0x93, 0xbe, 0xed, + 0xf, 0x33, 0x79, 0x88, 0x75, 0x87, 0xdd, 0xc5, 0x12, 0xc3, 0x4, + 0x60, 0x78, 0x64, 0xe, 0x95, 0xc2, 0xcb, 0xdc, 0x93, 0x60}, + }, + { + {0x4b, 0x3, 0x84, 0x60, 0xbe, 0xee, 0xde, 0x6b, 0x54, 0xb8, 0xf, + 0x78, 0xb6, 0xc2, 0x99, 0x31, 0x95, 0x6, 0x2d, 0xb6, 0xab, 0x76, + 0x33, 0x97, 0x90, 0x7d, 0x64, 0x8b, 0xc9, 0x80, 0x31, 0x6e}, + {0x6d, 0x70, 0xe0, 0x85, 0x85, 0x9a, 0xf3, 0x1f, 0x33, 0x39, 0xe7, + 0xb3, 0xd8, 0xa5, 0xd0, 0x36, 0x3b, 0x45, 0x8f, 0x71, 0xe1, 0xf2, + 0xb9, 0x43, 0x7c, 0xa9, 0x27, 0x48, 0x8, 0xea, 0xd1, 0x57}, + {0x71, 0xb0, 0x28, 0xa1, 0xe7, 0xb6, 0x7a, 0xee, 0xaa, 0x8b, 0xa8, + 0x93, 0x6d, 0x59, 0xc1, 0xa4, 0x30, 0x61, 0x21, 0xb2, 0x82, 0xde, + 0xb4, 0xf7, 0x18, 0xbd, 0x97, 0xdd, 0x9d, 0x99, 0x3e, 0x36}, + }, + { + {0xc6, 0xae, 0x4b, 0xe2, 0xdc, 0x48, 0x18, 0x2f, 0x60, 0xaf, 0xbc, + 0xba, 0x55, 0x72, 0x9b, 0x76, 0x31, 0xe9, 0xef, 0x3c, 0x6e, 0x3c, + 0xcb, 0x90, 0x55, 0xb3, 0xf9, 0xc6, 0x9b, 0x97, 0x1f, 0x23}, + {0xc4, 0x1f, 0xee, 0x35, 0xc1, 0x43, 0xa8, 0x96, 0xcf, 0xc8, 0xe4, + 0x8, 0x55, 0xb3, 0x6e, 0x97, 0x30, 0xd3, 0x8c, 0xb5, 0x1, 0x68, + 0x2f, 0xb4, 0x2b, 0x5, 0x3a, 0x69, 0x78, 0x9b, 0xee, 0x48}, + {0xc6, 0xf3, 0x2a, 0xcc, 0x4b, 0xde, 0x31, 0x5c, 0x1f, 0x8d, 0x20, + 0xfe, 0x30, 0xb0, 0x4b, 0xb0, 0x66, 0xb4, 0x4f, 0xc1, 0x9, 0x70, + 0x8d, 0xb7, 0x13, 0x24, 0x79, 0x8, 0x9b, 0xfa, 0x9b, 0x7}, + }, + { + {0x45, 0x42, 0xd5, 0xa2, 0x80, 0xed, 0xc9, 0xf3, 0x52, 0x39, 0xf6, + 0x77, 0x78, 0x8b, 0xa0, 0xa, 0x75, 0x54, 0x8, 0xd1, 0x63, 0xac, + 0x6d, 0xd7, 0x6b, 0x63, 0x70, 0x94, 0x15, 0xfb, 0xf4, 0x1e}, + {0xf4, 0xd, 0x30, 0xda, 0x51, 0x3a, 0x90, 0xe3, 0xb0, 0x5a, 0xa9, + 0x3d, 0x23, 0x64, 0x39, 0x84, 0x80, 0x64, 0x35, 0xb, 0x2d, 0xf1, + 0x3c, 0xed, 0x94, 0x71, 0x81, 0x84, 0xf6, 0x77, 0x8c, 0x3}, + {0xec, 0x7b, 0x16, 0x5b, 0xe6, 0x5e, 0x4e, 0x85, 0xc2, 0xcd, 0xd0, + 0x96, 0x42, 0xa, 0x59, 0x59, 0x99, 0x21, 0x10, 0x98, 0x34, 0xdf, + 0xb2, 0x72, 0x56, 0xff, 0xb, 0x4a, 0x2a, 0xe9, 0x5e, 0x57}, + }, + { + {0x1, 0xd8, 0xa4, 0xa, 0x45, 0xbc, 0x46, 0x5d, 0xd8, 0xb9, 0x33, + 0xa5, 0x27, 0x12, 0xaf, 0xc3, 0xc2, 0x6, 0x89, 0x2b, 0x26, 0x3b, + 0x9e, 0x38, 0x1b, 0x58, 0x2f, 0x38, 0x7e, 0x1e, 0xa, 0x20}, + {0xcf, 0x2f, 0x18, 0x8a, 0x90, 0x80, 0xc0, 0xd4, 0xbd, 0x9d, 0x48, + 0x99, 0xc2, 0x70, 0xe1, 0x30, 0xde, 0x33, 0xf7, 0x52, 0x57, 0xbd, + 0xba, 0x5, 0x0, 0xfd, 0xd3, 0x2c, 0x11, 0xe7, 0xd4, 0x43}, + {0xc5, 0x3a, 0xf9, 0xea, 0x67, 0xb9, 0x8d, 0x51, 0xc0, 0x52, 0x66, + 0x5, 0x9b, 0x98, 0xbc, 0x71, 0xf5, 0x97, 0x71, 0x56, 0xd9, 0x85, + 0x2b, 0xfe, 0x38, 0x4e, 0x1e, 0x65, 0x52, 0xca, 0xe, 0x5}, + }, + { + {0xea, 0x68, 0xe6, 0x60, 0x76, 0x39, 0xac, 0x97, 0x97, 0xb4, 0x3a, + 0x15, 0xfe, 0xbb, 0x19, 0x9b, 0x9f, 0xa7, 0xec, 0x34, 0xb5, 0x79, + 0xb1, 0x4c, 0x57, 0xae, 0x31, 0xa1, 0x9f, 0xc0, 0x51, 0x61}, + {0x9c, 0xc, 0x3f, 0x45, 0xde, 0x1a, 0x43, 0xc3, 0x9b, 0x3b, 0x70, + 0xff, 0x5e, 0x4, 0xf5, 0xe9, 0x3d, 
0x7b, 0x84, 0xed, 0xc9, 0x7a, + 0xd9, 0xfc, 0xc6, 0xf4, 0x58, 0x1c, 0xc2, 0xe6, 0xe, 0x4b}, + {0x96, 0x5d, 0xf0, 0xfd, 0xd, 0x5c, 0xf5, 0x3a, 0x7a, 0xee, 0xb4, + 0x2a, 0xe0, 0x2e, 0x26, 0xdd, 0x9, 0x17, 0x17, 0x12, 0x87, 0xbb, + 0xb2, 0x11, 0xb, 0x3, 0xf, 0x80, 0xfa, 0x24, 0xef, 0x1f}, + }, + }, + { + { + {0x86, 0x6b, 0x97, 0x30, 0xf5, 0xaf, 0xd2, 0x22, 0x4, 0x46, 0xd2, + 0xc2, 0x6, 0xb8, 0x90, 0x8d, 0xe5, 0xba, 0xe5, 0x4d, 0x6c, 0x89, + 0xa1, 0xdc, 0x17, 0xc, 0x34, 0xc8, 0xe6, 0x5f, 0x0, 0x28}, + {0x96, 0x31, 0xa7, 0x1a, 0xfb, 0x53, 0xd6, 0x37, 0x18, 0x64, 0xd7, + 0x3f, 0x30, 0x95, 0x94, 0xf, 0xb2, 0x17, 0x3a, 0xfb, 0x9, 0xb, + 0x20, 0xad, 0x3e, 0x61, 0xc8, 0x2f, 0x29, 0x49, 0x4d, 0x54}, + {0x88, 0x86, 0x52, 0x34, 0x9f, 0xba, 0xef, 0x6a, 0xa1, 0x7d, 0x10, + 0x25, 0x94, 0xff, 0x1b, 0x5c, 0x36, 0x4b, 0xd9, 0x66, 0xcd, 0xbb, + 0x5b, 0xf7, 0xfa, 0x6d, 0x31, 0xf, 0x93, 0x72, 0xe4, 0x72}, + }, + { + {0x27, 0x76, 0x2a, 0xd3, 0x35, 0xf6, 0xf3, 0x7, 0xf0, 0x66, 0x65, + 0x5f, 0x86, 0x4d, 0xaa, 0x7a, 0x50, 0x44, 0xd0, 0x28, 0x97, 0xe7, + 0x85, 0x3c, 0x38, 0x64, 0xe0, 0xf, 0x0, 0x7f, 0xee, 0x1f}, + {0x4f, 0x8, 0x81, 0x97, 0x8c, 0x20, 0x95, 0x26, 0xe1, 0xe, 0x45, + 0x23, 0xb, 0x2a, 0x50, 0xb1, 0x2, 0xde, 0xef, 0x3, 0xa6, 0xae, + 0x9d, 0xfd, 0x4c, 0xa3, 0x33, 0x27, 0x8c, 0x2e, 0x9d, 0x5a}, + {0xe5, 0xf7, 0xdb, 0x3, 0xda, 0x5, 0x53, 0x76, 0xbd, 0xcd, 0x34, + 0x14, 0x49, 0xf2, 0xda, 0xa4, 0xec, 0x88, 0x4a, 0xd2, 0xcd, 0xd5, + 0x4a, 0x7b, 0x43, 0x5, 0x4, 0xee, 0x51, 0x40, 0xf9, 0x0}, + }, + { + {0x53, 0x97, 0xaf, 0x7, 0xbb, 0x93, 0xef, 0xd7, 0xa7, 0x66, 0xb7, + 0x3d, 0xcf, 0xd0, 0x3e, 0x58, 0xc5, 0x1e, 0xb, 0x6e, 0xbf, 0x98, + 0x69, 0xce, 0x52, 0x4, 0xd4, 0x5d, 0xd2, 0xff, 0xb7, 0x47}, + {0xb2, 0x30, 0xd3, 0xc3, 0x23, 0x6b, 0x35, 0x8d, 0x6, 0x1b, 0x47, + 0xb0, 0x9b, 0x8b, 0x1c, 0xf2, 0x3c, 0xb8, 0x42, 0x6e, 0x6c, 0x31, + 0x6c, 0xb3, 0xd, 0xb1, 0xea, 0x8b, 0x7e, 0x9c, 0xd7, 0x7}, + {0x12, 0xdd, 0x8, 0xbc, 0x9c, 0xfb, 0xfb, 0x87, 0x9b, 0xc2, 0xee, + 0xe1, 0x3a, 0x6b, 0x6, 0x8a, 0xbf, 0xc1, 0x1f, 0xdb, 0x2b, 0x24, + 0x57, 0xd, 0xb6, 0x4b, 0xa6, 0x5e, 0xa3, 0x20, 0x35, 0x1c}, + }, + { + {0x59, 0xc0, 0x6b, 0x21, 0x40, 0x6f, 0xa8, 0xcd, 0x7e, 0xd8, 0xbc, + 0x12, 0x1d, 0x23, 0xbb, 0x1f, 0x90, 0x9, 0xc7, 0x17, 0x9e, 0x6a, + 0x95, 0xb4, 0x55, 0x2e, 0xd1, 0x66, 0x3b, 0xc, 0x75, 0x38}, + {0x4a, 0xa3, 0xcb, 0xbc, 0xa6, 0x53, 0xd2, 0x80, 0x9b, 0x21, 0x38, + 0x38, 0xa1, 0xc3, 0x61, 0x3e, 0x96, 0xe3, 0x82, 0x98, 0x1, 0xb6, + 0xc3, 0x90, 0x6f, 0xe6, 0xe, 0x5d, 0x77, 0x5, 0x3d, 0x1c}, + {0x1a, 0xe5, 0x22, 0x94, 0x40, 0xf1, 0x2e, 0x69, 0x71, 0xf6, 0x5d, + 0x2b, 0x3c, 0xc7, 0xc0, 0xcb, 0x29, 0xe0, 0x4c, 0x74, 0xe7, 0x4f, + 0x1, 0x21, 0x7c, 0x48, 0x30, 0xd3, 0xc7, 0xe2, 0x21, 0x6}, + }, + { + {0xf3, 0xf0, 0xdb, 0xb0, 0x96, 0x17, 0xae, 0xb7, 0x96, 0xe1, 0x7c, + 0xe1, 0xb9, 0xaf, 0xdf, 0x54, 0xb4, 0xa3, 0xaa, 0xe9, 0x71, 0x30, + 0x92, 0x25, 0x9d, 0x2e, 0x0, 0xa1, 0x9c, 0x58, 0x8e, 0x5d}, + {0x8d, 0x83, 0x59, 0x82, 0xcc, 0x60, 0x98, 0xaf, 0xdc, 0x9a, 0x9f, + 0xc6, 0xc1, 0x48, 0xea, 0x90, 0x30, 0x1e, 0x58, 0x65, 0x37, 0x48, + 0x26, 0x65, 0xbc, 0xa5, 0xd3, 0x7b, 0x9, 0xd6, 0x7, 0x0}, + {0x4b, 0xa9, 0x42, 0x8, 0x95, 0x1d, 0xbf, 0xc0, 0x3e, 0x2e, 0x8f, + 0x58, 0x63, 0xc3, 0xd3, 0xb2, 0xef, 0xe2, 0x51, 0xbb, 0x38, 0x14, + 0x96, 0xa, 0x86, 0xbf, 0x1c, 0x3c, 0x78, 0xd7, 0x83, 0x15}, + }, + { + {0xc7, 0x28, 0x9d, 0xcc, 0x4, 0x47, 0x3, 0x90, 0x8f, 0xc5, 0x2c, + 0xf7, 0x9e, 0x67, 0x1b, 0x1d, 0x26, 0x87, 0x5b, 0xbe, 0x5f, 0x2b, + 0xe1, 0x16, 0xa, 0x58, 0xc5, 0x83, 0x4e, 0x6, 0x58, 0x49}, + {0xe1, 0x7a, 0xa2, 0x5d, 0xef, 0xa2, 0xee, 0xec, 
0x74, 0x1, 0x67, + 0x55, 0x14, 0x3a, 0x7c, 0x59, 0x7a, 0x16, 0x9, 0x66, 0x12, 0x2a, + 0xa6, 0xc9, 0x70, 0x8f, 0xed, 0x81, 0x2e, 0x5f, 0x2a, 0x25}, + {0xd, 0xe8, 0x66, 0x50, 0x26, 0x94, 0x28, 0xd, 0x6b, 0x8c, 0x7c, + 0x30, 0x85, 0xf7, 0xc3, 0xfc, 0xfd, 0x12, 0x11, 0xc, 0x78, 0xda, + 0x53, 0x1b, 0x88, 0xb3, 0x43, 0xd8, 0xb, 0x17, 0x9c, 0x7}, + }, + { + {0x56, 0xd0, 0xd5, 0xc0, 0x50, 0xcd, 0xd6, 0xcd, 0x3b, 0x57, 0x3, + 0xbb, 0x6d, 0x68, 0xf7, 0x9a, 0x48, 0xef, 0xc3, 0xf3, 0x3f, 0x72, + 0xa6, 0x3c, 0xcc, 0x8a, 0x7b, 0x31, 0xd7, 0xc0, 0x68, 0x67}, + {0xff, 0x6f, 0xfa, 0x64, 0xe4, 0xec, 0x6, 0x5, 0x23, 0xe5, 0x5, + 0x62, 0x1e, 0x43, 0xe3, 0xbe, 0x42, 0xea, 0xb8, 0x51, 0x24, 0x42, + 0x79, 0x35, 0x0, 0xfb, 0xc9, 0x4a, 0xe3, 0x5, 0xec, 0x6d}, + {0xb3, 0xc1, 0x55, 0xf1, 0xe5, 0x25, 0xb6, 0x94, 0x91, 0x7b, 0x7b, + 0x99, 0xa7, 0xf3, 0x7b, 0x41, 0x0, 0x26, 0x6b, 0x6d, 0xdc, 0xbd, + 0x2c, 0xc2, 0xf4, 0x52, 0xcd, 0xdd, 0x14, 0x5e, 0x44, 0x51}, + }, + { + {0x55, 0xa4, 0xbe, 0x2b, 0xab, 0x47, 0x31, 0x89, 0x29, 0x91, 0x7, + 0x92, 0x4f, 0xa2, 0x53, 0x8c, 0xa7, 0xf7, 0x30, 0xbe, 0x48, 0xf9, + 0x49, 0x4b, 0x3d, 0xd4, 0x4f, 0x6e, 0x8, 0x90, 0xe9, 0x12}, + {0x51, 0x49, 0x14, 0x3b, 0x4b, 0x2b, 0x50, 0x57, 0xb3, 0xbc, 0x4b, + 0x44, 0x6b, 0xff, 0x67, 0x8e, 0xdb, 0x85, 0x63, 0x16, 0x27, 0x69, + 0xbd, 0xb8, 0xc8, 0x95, 0x92, 0xe3, 0x31, 0x6f, 0x18, 0x13}, + {0x2e, 0xbb, 0xdf, 0x7f, 0xb3, 0x96, 0xc, 0xf1, 0xf9, 0xea, 0x1c, + 0x12, 0x5e, 0x93, 0x9a, 0x9f, 0x3f, 0x98, 0x5b, 0x3a, 0xc4, 0x36, + 0x11, 0xdf, 0xaf, 0x99, 0x3e, 0x5d, 0xf0, 0xe3, 0xb2, 0x77}, + }, + }, + { + { + {0xa4, 0xb0, 0xdd, 0x12, 0x9c, 0x63, 0x98, 0xd5, 0x6b, 0x86, 0x24, + 0xc0, 0x30, 0x9f, 0xd1, 0xa5, 0x60, 0xe4, 0xfc, 0x58, 0x3, 0x2f, + 0x7c, 0xd1, 0x8a, 0x5e, 0x9, 0x2e, 0x15, 0x95, 0xa1, 0x7}, + {0xde, 0xc4, 0x2e, 0x9c, 0xc5, 0xa9, 0x6f, 0x29, 0xcb, 0xf3, 0x84, + 0x4f, 0xbf, 0x61, 0x8b, 0xbc, 0x8, 0xf9, 0xa8, 0x17, 0xd9, 0x6, + 0x77, 0x1c, 0x5d, 0x25, 0xd3, 0x7a, 0xfc, 0x95, 0xb7, 0x63}, + {0xc8, 0x5f, 0x9e, 0x38, 0x2, 0x8f, 0x36, 0xa8, 0x3b, 0xe4, 0x8d, + 0xcf, 0x2, 0x3b, 0x43, 0x90, 0x43, 0x26, 0x41, 0xc5, 0x5d, 0xfd, + 0xa1, 0xaf, 0x37, 0x1, 0x2f, 0x3, 0x3d, 0xe8, 0x8f, 0x3e}, + }, + { + {0x3c, 0xd1, 0xef, 0xe8, 0x8d, 0x4c, 0x70, 0x8, 0x31, 0x37, 0xe0, + 0x33, 0x8e, 0x1a, 0xc5, 0xdf, 0xe3, 0xcd, 0x60, 0x12, 0xa5, 0x5d, + 0x9d, 0xa5, 0x86, 0x8c, 0x25, 0xa6, 0x99, 0x8, 0xd6, 0x22}, + {0x94, 0xa2, 0x70, 0x5, 0xb9, 0x15, 0x8b, 0x2f, 0x49, 0x45, 0x8, + 0x67, 0x70, 0x42, 0xf2, 0x94, 0x84, 0xfd, 0xbb, 0x61, 0xe1, 0x5a, + 0x1c, 0xde, 0x7, 0x40, 0xac, 0x7f, 0x79, 0x3b, 0xba, 0x75}, + {0x96, 0xd1, 0xcd, 0x70, 0xc0, 0xdb, 0x39, 0x62, 0x9a, 0x8a, 0x7d, + 0x6c, 0x8b, 0x8a, 0xfe, 0x60, 0x60, 0x12, 0x40, 0xeb, 0xbc, 0x47, + 0x88, 0xb3, 0x5e, 0x9e, 0x77, 0x87, 0x7b, 0xd0, 0x4, 0x9}, + }, + { + {0xb9, 0x40, 0xf9, 0x48, 0x66, 0x2d, 0x32, 0xf4, 0x39, 0xc, 0x2d, + 0xbd, 0xc, 0x2f, 0x95, 0x6, 0x31, 0xf9, 0x81, 0xa0, 0xad, 0x97, + 0x76, 0x16, 0x6c, 0x2a, 0xf7, 0xba, 0xce, 0xaa, 0x40, 0x62}, + {0x9c, 0x91, 0xba, 0xdd, 0xd4, 0x1f, 0xce, 0xb4, 0xaa, 0x8d, 0x4c, + 0xc7, 0x3e, 0xdb, 0x31, 0xcf, 0x51, 0xcc, 0x86, 0xad, 0x63, 0xcc, + 0x63, 0x2c, 0x7, 0xde, 0x1d, 0xbc, 0x3f, 0x14, 0xe2, 0x43}, + {0xa0, 0x95, 0xa2, 0x5b, 0x9c, 0x74, 0x34, 0xf8, 0x5a, 0xd2, 0x37, + 0xca, 0x5b, 0x7c, 0x94, 0xd6, 0x6a, 0x31, 0xc9, 0xe7, 0xa7, 0x3b, + 0xf1, 0x66, 0xac, 0xc, 0xb4, 0x8d, 0x23, 0xaf, 0xbd, 0x56}, + }, + { + {0xb2, 0x3b, 0x9d, 0xc1, 0x6c, 0xd3, 0x10, 0x13, 0xb9, 0x86, 0x23, + 0x62, 0xb7, 0x6b, 0x2a, 0x6, 0x5c, 0x4f, 0xa1, 0xd7, 0x91, 0x85, + 0x9b, 0x7c, 0x54, 0x57, 0x1e, 0x7e, 0x50, 
0x31, 0xaa, 0x3}, + {0xeb, 0x33, 0x35, 0xf5, 0xe3, 0xb9, 0x2a, 0x36, 0x40, 0x3d, 0xb9, + 0x6e, 0xd5, 0x68, 0x85, 0x33, 0x72, 0x55, 0x5a, 0x1d, 0x52, 0x14, + 0xe, 0x9e, 0x18, 0x13, 0x74, 0x83, 0x6d, 0xa8, 0x24, 0x1d}, + {0x1f, 0xce, 0xd4, 0xff, 0x48, 0x76, 0xec, 0xf4, 0x1c, 0x8c, 0xac, + 0x54, 0xf0, 0xea, 0x45, 0xe0, 0x7c, 0x35, 0x9, 0x1d, 0x82, 0x25, + 0xd2, 0x88, 0x59, 0x48, 0xeb, 0x9a, 0xdc, 0x61, 0xb2, 0x43}, + }, + { + {0x64, 0x13, 0x95, 0x6c, 0x8b, 0x3d, 0x51, 0x19, 0x7b, 0xf4, 0xb, + 0x0, 0x26, 0x71, 0xfe, 0x94, 0x67, 0x95, 0x4f, 0xd5, 0xdd, 0x10, + 0x8d, 0x2, 0x64, 0x9, 0x94, 0x42, 0xe2, 0xd5, 0xb4, 0x2}, + {0xbb, 0x79, 0xbb, 0x88, 0x19, 0x1e, 0x5b, 0xe5, 0x9d, 0x35, 0x7a, + 0xc1, 0x7d, 0xd0, 0x9e, 0xa0, 0x33, 0xea, 0x3d, 0x60, 0xe2, 0x2e, + 0x2c, 0xb0, 0xc2, 0x6b, 0x27, 0x5b, 0xcf, 0x55, 0x60, 0x32}, + {0xf2, 0x8d, 0xd1, 0x28, 0xcb, 0x55, 0xa1, 0xb4, 0x8, 0xe5, 0x6c, + 0x18, 0x46, 0x46, 0xcc, 0xea, 0x89, 0x43, 0x82, 0x6c, 0x93, 0xf4, + 0x9c, 0xc4, 0x10, 0x34, 0x5d, 0xae, 0x9, 0xc8, 0xa6, 0x27}, + }, + { + {0x54, 0x69, 0x3d, 0xc4, 0xa, 0x27, 0x2c, 0xcd, 0xb2, 0xca, 0x66, + 0x6a, 0x57, 0x3e, 0x4a, 0xdd, 0x6c, 0x3, 0xd7, 0x69, 0x24, 0x59, + 0xfa, 0x79, 0x99, 0x25, 0x8c, 0x3d, 0x60, 0x3, 0x15, 0x22}, + {0x88, 0xb1, 0xd, 0x1f, 0xcd, 0xeb, 0xa6, 0x8b, 0xe8, 0x5b, 0x5a, + 0x67, 0x3a, 0xd7, 0xd3, 0x37, 0x5a, 0x58, 0xf5, 0x15, 0xa3, 0xdf, + 0x2e, 0xf2, 0x7e, 0xa1, 0x60, 0xff, 0x74, 0x71, 0xb6, 0x2c}, + {0xd0, 0xe1, 0xb, 0x39, 0xf9, 0xcd, 0xee, 0x59, 0xf1, 0xe3, 0x8c, + 0x72, 0x44, 0x20, 0x42, 0xa9, 0xf4, 0xf0, 0x94, 0x7a, 0x66, 0x1c, + 0x89, 0x82, 0x36, 0xf4, 0x90, 0x38, 0xb7, 0xf4, 0x1d, 0x7b}, + }, + { + {0x8c, 0xf5, 0xf8, 0x7, 0x18, 0x22, 0x2e, 0x5f, 0xd4, 0x9, 0x94, + 0xd4, 0x9f, 0x5c, 0x55, 0xe3, 0x30, 0xa6, 0xb6, 0x1f, 0x8d, 0xa8, + 0xaa, 0xb2, 0x3d, 0xe0, 0x52, 0xd3, 0x45, 0x82, 0x69, 0x68}, + {0x24, 0xa2, 0xb2, 0xb3, 0xe0, 0xf2, 0x92, 0xe4, 0x60, 0x11, 0x55, + 0x2b, 0x6, 0x9e, 0x6c, 0x7c, 0xe, 0x7b, 0x7f, 0xd, 0xe2, 0x8f, + 0xeb, 0x15, 0x92, 0x59, 0xfc, 0x58, 0x26, 0xef, 0xfc, 0x61}, + {0x7a, 0x18, 0x18, 0x2a, 0x85, 0x5d, 0xb1, 0xdb, 0xd7, 0xac, 0xdd, + 0x86, 0xd3, 0xaa, 0xe4, 0xf3, 0x82, 0xc4, 0xf6, 0xf, 0x81, 0xe2, + 0xba, 0x44, 0xcf, 0x1, 0xaf, 0x3d, 0x47, 0x4c, 0xcf, 0x46}, + }, + { + {0x40, 0x81, 0x49, 0xf1, 0xa7, 0x6e, 0x3c, 0x21, 0x54, 0x48, 0x2b, + 0x39, 0xf8, 0x7e, 0x1e, 0x7c, 0xba, 0xce, 0x29, 0x56, 0x8c, 0xc3, + 0x88, 0x24, 0xbb, 0xc5, 0x8c, 0xd, 0xe5, 0xaa, 0x65, 0x10}, + {0xf9, 0xe5, 0xc4, 0x9e, 0xed, 0x25, 0x65, 0x42, 0x3, 0x33, 0x90, + 0x16, 0x1, 0xda, 0x5e, 0xe, 0xdc, 0xca, 0xe5, 0xcb, 0xf2, 0xa7, + 0xb1, 0x72, 0x40, 0x5f, 0xeb, 0x14, 0xcd, 0x7b, 0x38, 0x29}, + {0x57, 0xd, 0x20, 0xdf, 0x25, 0x45, 0x2c, 0x1c, 0x4a, 0x67, 0xca, + 0xbf, 0xd6, 0x2d, 0x3b, 0x5c, 0x30, 0x40, 0x83, 0xe1, 0xb1, 0xe7, + 0x7, 0xa, 0x16, 0xe7, 0x1c, 0x4f, 0xe6, 0x98, 0xa1, 0x69}, + }, + }, + { + { + {0xed, 0xca, 0xc5, 0xdc, 0x34, 0x44, 0x1, 0xe1, 0x33, 0xfb, 0x84, + 0x3c, 0x96, 0x5d, 0xed, 0x47, 0xe7, 0xa0, 0x86, 0xed, 0x76, 0x95, + 0x1, 0x70, 0xe4, 0xf9, 0x67, 0xd2, 0x7b, 0x69, 0xb2, 0x25}, + {0xbc, 0x78, 0x1a, 0xd9, 0xe0, 0xb2, 0x62, 0x90, 0x67, 0x96, 0x50, + 0xc8, 0x9c, 0x88, 0xc9, 0x47, 0xb8, 0x70, 0x50, 0x40, 0x66, 0x4a, + 0xf5, 0x9d, 0xbf, 0xa1, 0x93, 0x24, 0xa9, 0xe6, 0x69, 0x73}, + {0x64, 0x68, 0x98, 0x13, 0xfb, 0x3f, 0x67, 0x9d, 0xb8, 0xc7, 0x5d, + 0x41, 0xd9, 0xfb, 0xa5, 0x3c, 0x5e, 0x3b, 0x27, 0xdf, 0x3b, 0xcc, + 0x4e, 0xe0, 0xd2, 0x4c, 0x4e, 0xb5, 0x3d, 0x68, 0x20, 0x14}, + }, + { + {0xd0, 0x5a, 0xcc, 0xc1, 0x6f, 0xbb, 0xee, 0x34, 0x8b, 0xac, 0x46, + 0x96, 0xe9, 0xc, 0x1b, 0x6a, 
0x53, 0xde, 0x6b, 0xa6, 0x49, 0xda, + 0xb0, 0xd3, 0xc1, 0x81, 0xd0, 0x61, 0x41, 0x3b, 0xe8, 0x31}, + {0x97, 0xd1, 0x9d, 0x24, 0x1e, 0xbd, 0x78, 0xb4, 0x2, 0xc1, 0x58, + 0x5e, 0x0, 0x35, 0xc, 0x62, 0x5c, 0xac, 0xba, 0xcc, 0x2f, 0xd3, + 0x2, 0xfb, 0x2d, 0xa7, 0x8, 0xf5, 0xeb, 0x3b, 0xb6, 0x60}, + {0x4f, 0x2b, 0x6, 0x9e, 0x12, 0xc7, 0xe8, 0x97, 0xd8, 0xa, 0x32, + 0x29, 0x4f, 0x8f, 0xe4, 0x49, 0x3f, 0x68, 0x18, 0x6f, 0x4b, 0xe1, + 0xec, 0x5b, 0x17, 0x3, 0x55, 0x2d, 0xb6, 0x1e, 0xcf, 0x55}, + }, + { + {0x52, 0x8c, 0xf5, 0x7d, 0xe3, 0xb5, 0x76, 0x30, 0x36, 0xcc, 0x99, + 0xe7, 0xdd, 0xb9, 0x3a, 0xd7, 0x20, 0xee, 0x13, 0x49, 0xe3, 0x1c, + 0x83, 0xbd, 0x33, 0x1, 0xba, 0x62, 0xaa, 0xfb, 0x56, 0x1a}, + {0x58, 0x3d, 0xc2, 0x65, 0x10, 0x10, 0x79, 0x58, 0x9c, 0x81, 0x94, + 0x50, 0x6d, 0x8, 0x9d, 0x8b, 0xa7, 0x5f, 0xc5, 0x12, 0xa9, 0x2f, + 0x40, 0xe2, 0xd4, 0x91, 0x8, 0x57, 0x64, 0x65, 0x9a, 0x66}, + {0xec, 0xc9, 0x9d, 0x5c, 0x50, 0x6b, 0x3e, 0x94, 0x1a, 0x37, 0x7c, + 0xa7, 0xbb, 0x57, 0x25, 0x30, 0x51, 0x76, 0x34, 0x41, 0x56, 0xae, + 0x73, 0x98, 0x5c, 0x8a, 0xc5, 0x99, 0x67, 0x83, 0xc4, 0x13}, + }, + { + {0x80, 0xd0, 0x8b, 0x5d, 0x6a, 0xfb, 0xdc, 0xc4, 0x42, 0x48, 0x1a, + 0x57, 0xec, 0xc4, 0xeb, 0xde, 0x65, 0x53, 0xe5, 0xb8, 0x83, 0xe8, + 0xb2, 0xd4, 0x27, 0xb8, 0xe5, 0xc8, 0x7d, 0xc8, 0xbd, 0x50}, + {0xb9, 0xe1, 0xb3, 0x5a, 0x46, 0x5d, 0x3a, 0x42, 0x61, 0x3f, 0xf1, + 0xc7, 0x87, 0xc1, 0x13, 0xfc, 0xb6, 0xb9, 0xb5, 0xec, 0x64, 0x36, + 0xf8, 0x19, 0x7, 0xb6, 0x37, 0xa6, 0x93, 0xc, 0xf8, 0x66}, + {0x11, 0xe1, 0xdf, 0x6e, 0x83, 0x37, 0x6d, 0x60, 0xd9, 0xab, 0x11, + 0xf0, 0x15, 0x3e, 0x35, 0x32, 0x96, 0x3b, 0xb7, 0x25, 0xc3, 0x3a, + 0xb0, 0x64, 0xae, 0xd5, 0x5f, 0x72, 0x44, 0x64, 0xd5, 0x1d}, + }, + { + {0x9a, 0xc8, 0xba, 0x8, 0x0, 0xe6, 0x97, 0xc2, 0xe0, 0xc3, 0xe1, + 0xea, 0x11, 0xea, 0x4c, 0x7d, 0x7c, 0x97, 0xe7, 0x9f, 0xe1, 0x8b, + 0xe3, 0xf3, 0xcd, 0x5, 0xa3, 0x63, 0xf, 0x45, 0x3a, 0x3a}, + {0x7d, 0x12, 0x62, 0x33, 0xf8, 0x7f, 0xa4, 0x8f, 0x15, 0x7c, 0xcd, + 0x71, 0xc4, 0x6a, 0x9f, 0xbc, 0x8b, 0xc, 0x22, 0x49, 0x43, 0x45, + 0x71, 0x6e, 0x2e, 0x73, 0x9f, 0x21, 0x12, 0x59, 0x64, 0xe}, + {0x27, 0x46, 0x39, 0xd8, 0x31, 0x2f, 0x8f, 0x7, 0x10, 0xa5, 0x94, + 0xde, 0x83, 0x31, 0x9d, 0x38, 0x80, 0x6f, 0x99, 0x17, 0x6d, 0x6c, + 0xe3, 0xd1, 0x7b, 0xa8, 0xa9, 0x93, 0x93, 0x8d, 0x8c, 0x31}, + }, + { + {0x98, 0xd3, 0x1d, 0xab, 0x29, 0x9e, 0x66, 0x5d, 0x3b, 0x9e, 0x2d, + 0x34, 0x58, 0x16, 0x92, 0xfc, 0xcd, 0x73, 0x59, 0xf3, 0xfd, 0x1d, + 0x85, 0x55, 0xf6, 0xa, 0x95, 0x25, 0xc3, 0x41, 0x9a, 0x50}, + {0x19, 0xfe, 0xff, 0x2a, 0x3, 0x5d, 0x74, 0xf2, 0x66, 0xdb, 0x24, + 0x7f, 0x49, 0x3c, 0x9f, 0xc, 0xef, 0x98, 0x85, 0xba, 0xe3, 0xd3, + 0x98, 0xbc, 0x14, 0x53, 0x1d, 0x9a, 0x67, 0x7c, 0x4c, 0x22}, + {0xe9, 0x25, 0xf9, 0xa6, 0xdc, 0x6e, 0xc0, 0xbd, 0x33, 0x1f, 0x1b, + 0x64, 0xf4, 0xf3, 0x3e, 0x79, 0x89, 0x3e, 0x83, 0x9d, 0x80, 0x12, + 0xec, 0x82, 0x89, 0x13, 0xa1, 0x28, 0x23, 0xf0, 0xbf, 0x5}, + }, + { + {0xe4, 0x12, 0xc5, 0xd, 0xdd, 0xa0, 0x81, 0x68, 0xfe, 0xfa, 0xa5, + 0x44, 0xc8, 0xd, 0xe7, 0x4f, 0x40, 0x52, 0x4a, 0x8f, 0x6b, 0x8e, + 0x74, 0x1f, 0xea, 0xa3, 0x1, 0xee, 0xcd, 0x77, 0x62, 0x57}, + {0xb, 0xe0, 0xca, 0x23, 0x70, 0x13, 0x32, 0x36, 0x59, 0xcf, 0xac, + 0xd1, 0xa, 0xcf, 0x4a, 0x54, 0x88, 0x1c, 0x1a, 0xd2, 0x49, 0x10, + 0x74, 0x96, 0xa7, 0x44, 0x2a, 0xfa, 0xc3, 0x8c, 0xb, 0x78}, + {0x5f, 0x30, 0x4f, 0x23, 0xbc, 0x8a, 0xf3, 0x1e, 0x8, 0xde, 0x5, + 0x14, 0xbd, 0x7f, 0x57, 0x9a, 0xd, 0x2a, 0xe6, 0x34, 0x14, 0xa5, + 0x82, 0x5e, 0xa1, 0xb7, 0x71, 0x62, 0x72, 0x18, 0xf4, 0x5f}, + }, + { + {0x40, 0x95, 0xb6, 0x13, 0xe8, 
0x47, 0xdb, 0xe5, 0xe1, 0x10, 0x26, + 0x43, 0x3b, 0x2a, 0x5d, 0xf3, 0x76, 0x12, 0x78, 0x38, 0xe9, 0x26, + 0x1f, 0xac, 0x69, 0xcb, 0xa0, 0xa0, 0x8c, 0xdb, 0xd4, 0x29}, + {0x9d, 0xdb, 0x89, 0x17, 0xc, 0x8, 0x8e, 0x39, 0xf5, 0x78, 0xe7, + 0xf3, 0x25, 0x20, 0x60, 0xa7, 0x5d, 0x3, 0xbd, 0x6, 0x4c, 0x89, + 0x98, 0xfa, 0xbe, 0x66, 0xa9, 0x25, 0xdc, 0x3, 0x6a, 0x10}, + {0xd0, 0x53, 0x33, 0x33, 0xaf, 0xa, 0xad, 0xd9, 0xe5, 0x9, 0xd3, + 0xac, 0xa5, 0x9d, 0x66, 0x38, 0xf0, 0xf7, 0x88, 0xc8, 0x8a, 0x65, + 0x57, 0x3c, 0xfa, 0xbe, 0x2c, 0x5, 0x51, 0x8a, 0xb3, 0x4a}, + }, + }, + { + { + {0x9c, 0xc0, 0xdd, 0x5f, 0xef, 0xd1, 0xcf, 0xd6, 0xce, 0x5d, 0x57, + 0xf7, 0xfd, 0x3e, 0x2b, 0xe8, 0xc2, 0x34, 0x16, 0x20, 0x5d, 0x6b, + 0xd5, 0x25, 0x9b, 0x2b, 0xed, 0x4, 0xbb, 0xc6, 0x41, 0x30}, + {0x93, 0xd5, 0x68, 0x67, 0x25, 0x2b, 0x7c, 0xda, 0x13, 0xca, 0x22, + 0x44, 0x57, 0xc0, 0xc1, 0x98, 0x1d, 0xce, 0xa, 0xca, 0xd5, 0xb, + 0xa8, 0xf1, 0x90, 0xa6, 0x88, 0xc0, 0xad, 0xd1, 0xcd, 0x29}, + {0x48, 0xe1, 0x56, 0xd9, 0xf9, 0xf2, 0xf2, 0xf, 0x2e, 0x6b, 0x35, + 0x9f, 0x75, 0x97, 0xe7, 0xad, 0x5c, 0x2, 0x6c, 0x5f, 0xbb, 0x98, + 0x46, 0x1a, 0x7b, 0x9a, 0x4, 0x14, 0x68, 0xbd, 0x4b, 0x10}, + }, + { + {0x63, 0xf1, 0x7f, 0xd6, 0x5f, 0x9a, 0x5d, 0xa9, 0x81, 0x56, 0xc7, + 0x4c, 0x9d, 0xe6, 0x2b, 0xe9, 0x57, 0xf2, 0x20, 0xde, 0x4c, 0x2, + 0xf8, 0xb7, 0xf5, 0x2d, 0x7, 0xfb, 0x20, 0x2a, 0x4f, 0x20}, + {0x67, 0xed, 0xf1, 0x68, 0x31, 0xfd, 0xf0, 0x51, 0xc2, 0x3b, 0x6f, + 0xd8, 0xcd, 0x1d, 0x81, 0x2c, 0xde, 0xf2, 0xd2, 0x4, 0x43, 0x5c, + 0xdc, 0x44, 0x49, 0x71, 0x2a, 0x9, 0x57, 0xcc, 0xe8, 0x5b}, + {0x79, 0xb0, 0xeb, 0x30, 0x3d, 0x3b, 0x14, 0xc8, 0x30, 0x2e, 0x65, + 0xbd, 0x5a, 0x15, 0x89, 0x75, 0x31, 0x5c, 0x6d, 0x8f, 0x31, 0x3c, + 0x3c, 0x65, 0x1f, 0x16, 0x79, 0xc2, 0x17, 0xfb, 0x70, 0x25}, + }, + { + {0x5a, 0x24, 0xb8, 0xb, 0x55, 0xa9, 0x2e, 0x19, 0xd1, 0x50, 0x90, + 0x8f, 0xa8, 0xfb, 0xe6, 0xc8, 0x35, 0xc9, 0xa4, 0x88, 0x2d, 0xea, + 0x86, 0x79, 0x68, 0x86, 0x1, 0xde, 0x91, 0x5f, 0x1c, 0x24}, + {0x75, 0x15, 0xb6, 0x2c, 0x7f, 0x36, 0xfa, 0x3e, 0x6c, 0x2, 0xd6, + 0x1c, 0x76, 0x6f, 0xf9, 0xf5, 0x62, 0x25, 0xb5, 0x65, 0x2a, 0x14, + 0xc7, 0xe8, 0xcd, 0xa, 0x3, 0x53, 0xea, 0x65, 0xcb, 0x3d}, + {0xaa, 0x6c, 0xde, 0x40, 0x29, 0x17, 0xd8, 0x28, 0x3a, 0x73, 0xd9, + 0x22, 0xf0, 0x2c, 0xbf, 0x8f, 0xd1, 0x1, 0x5b, 0x23, 0xdd, 0xfc, + 0xd7, 0x16, 0xe5, 0xf0, 0xcd, 0x5f, 0xdd, 0xe, 0x42, 0x8}, + }, + { + {0xce, 0x10, 0xf4, 0x4, 0x4e, 0xc3, 0x58, 0x3, 0x85, 0x6, 0x6e, + 0x27, 0x5a, 0x5b, 0x13, 0xb6, 0x21, 0x15, 0xb9, 0xeb, 0xc7, 0x70, + 0x96, 0x5d, 0x9c, 0x88, 0xdb, 0x21, 0xf3, 0x54, 0xd6, 0x4}, + {0x4a, 0xfa, 0x62, 0x83, 0xab, 0x20, 0xff, 0xcd, 0x6e, 0x3e, 0x1a, + 0xe2, 0xd4, 0x18, 0xe1, 0x57, 0x2b, 0xe6, 0x39, 0xfc, 0x17, 0x96, + 0x17, 0xe3, 0xfd, 0x69, 0x17, 0xbc, 0xef, 0x53, 0x9a, 0xd}, + {0xd5, 0xb5, 0xbd, 0xdd, 0x16, 0xc1, 0x7d, 0x5e, 0x2d, 0xdd, 0xa5, + 0x8d, 0xb6, 0xde, 0x54, 0x29, 0x92, 0xa2, 0x34, 0x33, 0x17, 0x8, + 0xb6, 0x1c, 0xd7, 0x1a, 0x99, 0x18, 0x26, 0x4f, 0x7a, 0x4a}, + }, + { + {0x4b, 0x2a, 0x37, 0xaf, 0x91, 0xb2, 0xc3, 0x24, 0xf2, 0x47, 0x81, + 0x71, 0x70, 0x82, 0xda, 0x93, 0xf2, 0x9e, 0x89, 0x86, 0x64, 0x85, + 0x84, 0xdd, 0x33, 0xee, 0xe0, 0x23, 0x42, 0x31, 0x96, 0x4a}, + {0x95, 0x5f, 0xb1, 0x5f, 0x2, 0x18, 0xa7, 0xf4, 0x8f, 0x1b, 0x5c, + 0x6b, 0x34, 0x5f, 0xf6, 0x3d, 0x12, 0x11, 0xe0, 0x0, 0x85, 0xf0, + 0xfc, 0xcd, 0x48, 0x18, 0xd3, 0xdd, 0x4c, 0xc, 0xb5, 0x11}, + {0xd6, 0xff, 0xa4, 0x8, 0x44, 0x27, 0xe8, 0xa6, 0xd9, 0x76, 0x15, + 0x9c, 0x7e, 0x17, 0x8e, 0x73, 0xf2, 0xb3, 0x2, 0x3d, 0xb6, 0x48, + 0x33, 0x77, 0x51, 0xcc, 0x6b, 
0xce, 0x4d, 0xce, 0x4b, 0x4f}, + }, + { + {0x6f, 0xb, 0x9d, 0xc4, 0x6e, 0x61, 0xe2, 0x30, 0x17, 0x23, 0xec, + 0xca, 0x8f, 0x71, 0x56, 0xe4, 0xa6, 0x4f, 0x6b, 0xf2, 0x9b, 0x40, + 0xeb, 0x48, 0x37, 0x5f, 0x59, 0x61, 0xe5, 0xce, 0x42, 0x30}, + {0x84, 0x25, 0x24, 0xe2, 0x5a, 0xce, 0x1f, 0xa7, 0x9e, 0x8a, 0xf5, + 0x92, 0x56, 0x72, 0xea, 0x26, 0xf4, 0x3c, 0xea, 0x1c, 0xd7, 0x9, + 0x1a, 0xd2, 0xe6, 0x1, 0x1c, 0xb7, 0x14, 0xdd, 0xfc, 0x73}, + {0x41, 0xac, 0x9b, 0x44, 0x79, 0x70, 0x7e, 0x42, 0xa, 0x31, 0xe2, + 0xbc, 0x6d, 0xe3, 0x5a, 0x85, 0x7c, 0x1a, 0x84, 0x5f, 0x21, 0x76, + 0xae, 0x4c, 0xd6, 0xe1, 0x9c, 0x9a, 0xc, 0x74, 0x9e, 0x38}, + }, + { + {0x28, 0xac, 0xe, 0x57, 0xf6, 0x78, 0xbd, 0xc9, 0xe1, 0x9c, 0x91, + 0x27, 0x32, 0xb, 0x5b, 0xe5, 0xed, 0x91, 0x9b, 0xa1, 0xab, 0x3e, + 0xfc, 0x65, 0x90, 0x36, 0x26, 0xd6, 0xe5, 0x25, 0xc4, 0x25}, + {0xce, 0xb9, 0xdc, 0x34, 0xae, 0xb3, 0xfc, 0x64, 0xad, 0xd0, 0x48, + 0xe3, 0x23, 0x3, 0x50, 0x97, 0x1b, 0x38, 0xc6, 0x62, 0x7d, 0xf0, + 0xb3, 0x45, 0x88, 0x67, 0x5a, 0x46, 0x79, 0x53, 0x54, 0x61}, + {0x6e, 0xde, 0xd7, 0xf1, 0xa6, 0x6, 0x3e, 0x3f, 0x8, 0x23, 0x6, + 0x8e, 0x27, 0x76, 0xf9, 0x3e, 0x77, 0x6c, 0x8a, 0x4e, 0x26, 0xf6, + 0x14, 0x8c, 0x59, 0x47, 0x48, 0x15, 0x89, 0xa0, 0x39, 0x65}, + }, + { + {0x19, 0x4a, 0xbb, 0x14, 0xd4, 0xdb, 0xc4, 0xdd, 0x8e, 0x4f, 0x42, + 0x98, 0x3c, 0xbc, 0xb2, 0x19, 0x69, 0x71, 0xca, 0x36, 0xd7, 0x9f, + 0xa8, 0x48, 0x90, 0xbd, 0x19, 0xf0, 0xe, 0x32, 0x65, 0xf}, + {0x73, 0xf7, 0xd2, 0xc3, 0x74, 0x1f, 0xd2, 0xe9, 0x45, 0x68, 0xc4, + 0x25, 0x41, 0x54, 0x50, 0xc1, 0x33, 0x9e, 0xb9, 0xf9, 0xe8, 0x5c, + 0x4e, 0x62, 0x6c, 0x18, 0xcd, 0xc5, 0xaa, 0xe4, 0xc5, 0x11}, + {0xc6, 0xe0, 0xfd, 0xca, 0xb1, 0xd1, 0x86, 0xd4, 0x81, 0x51, 0x3b, + 0x16, 0xe3, 0xe6, 0x3f, 0x4f, 0x9a, 0x93, 0xf2, 0xfa, 0xd, 0xaf, + 0xa8, 0x59, 0x2a, 0x7, 0x33, 0xec, 0xbd, 0xc7, 0xab, 0x4c}, + }, + }, + { + { + {0x89, 0xd2, 0x78, 0x3f, 0x8f, 0x78, 0x8f, 0xc0, 0x9f, 0x4d, 0x40, + 0xa1, 0x2c, 0xa7, 0x30, 0xfe, 0x9d, 0xcc, 0x65, 0xcf, 0xfc, 0x8b, + 0x77, 0xf2, 0x21, 0x20, 0xcb, 0x5a, 0x16, 0x98, 0xe4, 0x7e}, + {0x2e, 0xa, 0x9c, 0x8, 0x24, 0x96, 0x9e, 0x23, 0x38, 0x47, 0xfe, + 0x3a, 0xc0, 0xc4, 0x48, 0xc7, 0x2a, 0xa1, 0x4f, 0x76, 0x2a, 0xed, + 0xdb, 0x17, 0x82, 0x85, 0x1c, 0x32, 0xf0, 0x93, 0x9b, 0x63}, + {0xc3, 0xa1, 0x11, 0x91, 0xe3, 0x8, 0xd5, 0x7b, 0x89, 0x74, 0x90, + 0x80, 0xd4, 0x90, 0x2b, 0x2b, 0x19, 0xfd, 0x72, 0xae, 0xc2, 0xae, + 0xd2, 0xe7, 0xa6, 0x2, 0xb6, 0x85, 0x3c, 0x49, 0xdf, 0xe}, + }, + { + {0x13, 0x41, 0x76, 0x84, 0xd2, 0xc4, 0x67, 0x67, 0x35, 0xf8, 0xf5, + 0xf7, 0x3f, 0x40, 0x90, 0xa0, 0xde, 0xbe, 0xe6, 0xca, 0xfa, 0xcf, + 0x8f, 0x1c, 0x69, 0xa3, 0xdf, 0xd1, 0x54, 0xc, 0xc0, 0x4}, + {0x68, 0x5a, 0x9b, 0x59, 0x58, 0x81, 0xcc, 0xae, 0xe, 0xe2, 0xad, + 0xeb, 0xf, 0x4f, 0x57, 0xea, 0x7, 0x7f, 0xb6, 0x22, 0x74, 0x1d, + 0xe4, 0x4f, 0xb4, 0x4f, 0x9d, 0x1, 0xe3, 0x92, 0x3b, 0x40}, + {0xf8, 0x5c, 0x46, 0x8b, 0x81, 0x2f, 0xc2, 0x4d, 0xf8, 0xef, 0x80, + 0x14, 0x5a, 0xf3, 0xa0, 0x71, 0x57, 0xd6, 0xc7, 0x4, 0xad, 0xbf, + 0xe8, 0xae, 0xf4, 0x76, 0x61, 0xb2, 0x2a, 0xb1, 0x5b, 0x35}, + }, + { + {0x18, 0x73, 0x8c, 0x5a, 0xc7, 0xda, 0x1, 0xa3, 0x11, 0xaa, 0xce, + 0xb3, 0x9d, 0x3, 0x90, 0xed, 0x2d, 0x3f, 0xae, 0x3b, 0xbf, 0x7c, + 0x7, 0x6f, 0x8e, 0xad, 0x52, 0xe0, 0xf8, 0xea, 0x18, 0x75}, + {0xf4, 0xbb, 0x93, 0x74, 0xcc, 0x64, 0x1e, 0xa7, 0xc3, 0xb0, 0xa3, + 0xec, 0xd9, 0x84, 0xbd, 0xe5, 0x85, 0xe7, 0x5, 0xfa, 0xc, 0xc5, + 0x6b, 0xa, 0x12, 0xc3, 0x2e, 0x18, 0x32, 0x81, 0x9b, 0xf}, + {0x32, 0x6c, 0x7f, 0x1b, 0xc4, 0x59, 0x88, 0xa4, 0x98, 0x32, 0x38, + 0xf4, 0xbc, 0x60, 0x2d, 
0xf, 0xd9, 0xd1, 0xb1, 0xc9, 0x29, 0xa9, + 0x15, 0x18, 0xc4, 0x55, 0x17, 0xbb, 0x1b, 0x87, 0xc3, 0x47}, + }, + { + {0xb0, 0x66, 0x50, 0xc8, 0x50, 0x5d, 0xe6, 0xfb, 0xb0, 0x99, 0xa2, + 0xb3, 0xb0, 0xc4, 0xec, 0x62, 0xe0, 0xe8, 0x1a, 0x44, 0xea, 0x54, + 0x37, 0xe5, 0x5f, 0x8d, 0xd4, 0xe8, 0x2c, 0xa0, 0xfe, 0x8}, + {0x48, 0x4f, 0xec, 0x71, 0x97, 0x53, 0x44, 0x51, 0x6e, 0x5d, 0x8c, + 0xc9, 0x7d, 0xb1, 0x5, 0xf8, 0x6b, 0xc6, 0xc3, 0x47, 0x1a, 0xc1, + 0x62, 0xf7, 0xdc, 0x99, 0x46, 0x76, 0x85, 0x9b, 0xb8, 0x0}, + {0xd0, 0xea, 0xde, 0x68, 0x76, 0xdd, 0x4d, 0x82, 0x23, 0x5d, 0x68, + 0x4b, 0x20, 0x45, 0x64, 0xc8, 0x65, 0xd6, 0x89, 0x5d, 0xcd, 0xcf, + 0x14, 0xb5, 0x37, 0xd5, 0x75, 0x4f, 0xa7, 0x29, 0x38, 0x47}, + }, + { + {0xc9, 0x2, 0x39, 0xad, 0x3a, 0x53, 0xd9, 0x23, 0x8f, 0x58, 0x3, + 0xef, 0xce, 0xdd, 0xc2, 0x64, 0xb4, 0x2f, 0xe1, 0xcf, 0x90, 0x73, + 0x25, 0x15, 0x90, 0xd3, 0xe4, 0x44, 0x4d, 0x8b, 0x66, 0x6c}, + {0x18, 0xc4, 0x79, 0x46, 0x75, 0xda, 0xd2, 0x82, 0xf0, 0x8d, 0x61, + 0xb2, 0xd8, 0xd7, 0x3b, 0xe6, 0xa, 0xeb, 0x47, 0xac, 0x24, 0xef, + 0x5e, 0x35, 0xb4, 0xc6, 0x33, 0x48, 0x4c, 0x68, 0x78, 0x20}, + {0xc, 0x82, 0x78, 0x7a, 0x21, 0xcf, 0x48, 0x3b, 0x97, 0x3e, 0x27, + 0x81, 0xb2, 0xa, 0x6a, 0xf7, 0x7b, 0xed, 0x8e, 0x8c, 0xa7, 0x65, + 0x6c, 0xa9, 0x3f, 0x43, 0x8a, 0x4f, 0x5, 0xa6, 0x11, 0x74}, + }, + { + {0xb4, 0x75, 0xb1, 0x18, 0x3d, 0xe5, 0x9a, 0x57, 0x2, 0xa1, 0x92, + 0xf3, 0x59, 0x31, 0x71, 0x68, 0xf5, 0x35, 0xef, 0x1e, 0xba, 0xec, + 0x55, 0x84, 0x8f, 0x39, 0x8c, 0x45, 0x72, 0xa8, 0xc9, 0x1e}, + {0x6d, 0xc8, 0x9d, 0xb9, 0x32, 0x9d, 0x65, 0x4d, 0x15, 0xf1, 0x3a, + 0x60, 0x75, 0xdc, 0x4c, 0x4, 0x88, 0xe4, 0xc2, 0xdc, 0x2c, 0x71, + 0x4c, 0xb3, 0xff, 0x34, 0x81, 0xfb, 0x74, 0x65, 0x13, 0x7c}, + {0x9b, 0x50, 0xa2, 0x0, 0xd4, 0xa4, 0xe6, 0xb8, 0xb4, 0x82, 0xc8, + 0xb, 0x2, 0xd7, 0x81, 0x9b, 0x61, 0x75, 0x95, 0xf1, 0x9b, 0xcc, + 0xe7, 0x57, 0x60, 0x64, 0xcd, 0xc7, 0xa5, 0x88, 0xdd, 0x3a}, + }, + { + {0x46, 0x30, 0x39, 0x59, 0xd4, 0x98, 0xc2, 0x85, 0xec, 0x59, 0xf6, + 0x5f, 0x98, 0x35, 0x7e, 0x8f, 0x3a, 0x6e, 0xf6, 0xf2, 0x2a, 0xa2, + 0x2c, 0x1d, 0x20, 0xa7, 0x6, 0xa4, 0x31, 0x11, 0xba, 0x61}, + {0xf2, 0xdc, 0x35, 0xb6, 0x70, 0x57, 0x89, 0xab, 0xbc, 0x1f, 0x6c, + 0xf6, 0x6c, 0xef, 0xdf, 0x2, 0x87, 0xd1, 0xb6, 0xbe, 0x68, 0x2, + 0x53, 0x85, 0x74, 0x9e, 0x87, 0xcc, 0xfc, 0x29, 0x99, 0x24}, + {0x29, 0x90, 0x95, 0x16, 0xf1, 0xa0, 0xd0, 0xa3, 0x89, 0xbd, 0x7e, + 0xba, 0x6c, 0x6b, 0x3b, 0x2, 0x7, 0x33, 0x78, 0x26, 0x3e, 0x5a, + 0xf1, 0x7b, 0xe7, 0xec, 0xd8, 0xbb, 0xc, 0x31, 0x20, 0x56}, + }, + { + {0xd6, 0x85, 0xe2, 0x77, 0xf4, 0xb5, 0x46, 0x66, 0x93, 0x61, 0x8f, + 0x6c, 0x67, 0xff, 0xe8, 0x40, 0xdd, 0x94, 0xb5, 0xab, 0x11, 0x73, + 0xec, 0xa6, 0x4d, 0xec, 0x8c, 0x65, 0xf3, 0x46, 0xc8, 0x7e}, + {0x43, 0xd6, 0x34, 0x49, 0x43, 0x93, 0x89, 0x52, 0xf5, 0x22, 0x12, + 0xa5, 0x6, 0xf8, 0xdb, 0xb9, 0x22, 0x1c, 0xf4, 0xc3, 0x8f, 0x87, + 0x6d, 0x8f, 0x30, 0x97, 0x9d, 0x4d, 0x2a, 0x6a, 0x67, 0x37}, + {0xc7, 0x2e, 0xa2, 0x1d, 0x3f, 0x8f, 0x5e, 0x9b, 0x13, 0xcd, 0x1, + 0x6c, 0x77, 0x1d, 0xf, 0x13, 0xb8, 0x9f, 0x98, 0xa2, 0xcf, 0x8f, + 0x4c, 0x21, 0xd5, 0x9d, 0x9b, 0x39, 0x23, 0xf7, 0xaa, 0x6d}, + }, + }, + { + { + {0xa2, 0x8e, 0xad, 0xac, 0xbf, 0x4, 0x3b, 0x58, 0x84, 0xe8, 0x8b, + 0x14, 0xe8, 0x43, 0xb7, 0x29, 0xdb, 0xc5, 0x10, 0x8, 0x3b, 0x58, + 0x1e, 0x2b, 0xaa, 0xbb, 0xb3, 0x8e, 0xe5, 0x49, 0x54, 0x2b}, + {0x47, 0xbe, 0x3d, 0xeb, 0x62, 0x75, 0x3a, 0x5f, 0xb8, 0xa0, 0xbd, + 0x8e, 0x54, 0x38, 0xea, 0xf7, 0x99, 0x72, 0x74, 0x45, 0x31, 0xe5, + 0xc3, 0x0, 0x51, 0xd5, 0x27, 0x16, 0xe7, 0xe9, 0x4, 0x13}, + {0xfe, 0x9c, 
0xdc, 0x6a, 0xd2, 0x14, 0x98, 0x78, 0xb, 0xdd, 0x48, + 0x8b, 0x3f, 0xab, 0x1b, 0x3c, 0xa, 0xc6, 0x79, 0xf9, 0xff, 0xe1, + 0xf, 0xda, 0x93, 0xd6, 0x2d, 0x7c, 0x2d, 0xde, 0x68, 0x44}, + }, + { + {0xce, 0x7, 0x63, 0xf8, 0xc6, 0xd8, 0x9a, 0x4b, 0x28, 0xc, 0x5d, + 0x43, 0x31, 0x35, 0x11, 0x21, 0x2c, 0x77, 0x7a, 0x65, 0xc5, 0x66, + 0xa8, 0xd4, 0x52, 0x73, 0x24, 0x63, 0x7e, 0x42, 0xa6, 0x5d}, + {0x9e, 0x46, 0x19, 0x94, 0x5e, 0x35, 0xbb, 0x51, 0x54, 0xc7, 0xdd, + 0x23, 0x4c, 0xdc, 0xe6, 0x33, 0x62, 0x99, 0x7f, 0x44, 0xd6, 0xb6, + 0xa5, 0x93, 0x63, 0xbd, 0x44, 0xfb, 0x6f, 0x7c, 0xce, 0x6c}, + {0xca, 0x22, 0xac, 0xde, 0x88, 0xc6, 0x94, 0x1a, 0xf8, 0x1f, 0xae, + 0xbb, 0xf7, 0x6e, 0x6, 0xb9, 0xf, 0x58, 0x59, 0x8d, 0x38, 0x8c, + 0xad, 0x88, 0xa8, 0x2c, 0x9f, 0xe7, 0xbf, 0x9a, 0xf2, 0x58}, + }, + { + {0xf6, 0xcd, 0xe, 0x71, 0xbf, 0x64, 0x5a, 0x4b, 0x3c, 0x29, 0x2c, + 0x46, 0x38, 0xe5, 0x4c, 0xb1, 0xb9, 0x3a, 0xb, 0xd5, 0x56, 0xd0, + 0x43, 0x36, 0x70, 0x48, 0x5b, 0x18, 0x24, 0x37, 0xf9, 0x6a}, + {0x68, 0x3e, 0xe7, 0x8d, 0xab, 0xcf, 0xe, 0xe9, 0xa5, 0x76, 0x7e, + 0x37, 0x9f, 0x6f, 0x3, 0x54, 0x82, 0x59, 0x1, 0xbe, 0xb, 0x5b, + 0x49, 0xf0, 0x36, 0x1e, 0xf4, 0xa7, 0xc4, 0x29, 0x76, 0x57}, + {0x88, 0xa8, 0xc6, 0x9, 0x45, 0x2, 0x20, 0x32, 0x73, 0x89, 0x55, + 0x4b, 0x13, 0x36, 0xe0, 0xd2, 0x9f, 0x28, 0x33, 0x3c, 0x23, 0x36, + 0xe2, 0x83, 0x8f, 0xc1, 0xae, 0xc, 0xbb, 0x25, 0x1f, 0x70}, + }, + { + {0x13, 0xc1, 0xbe, 0x7c, 0xd9, 0xf6, 0x18, 0x9d, 0xe4, 0xdb, 0xbf, + 0x74, 0xe6, 0x6, 0x4a, 0x84, 0xd6, 0x60, 0x4e, 0xac, 0x22, 0xb5, + 0xf5, 0x20, 0x51, 0x5e, 0x95, 0x50, 0xc0, 0x5b, 0xa, 0x72}, + {0xed, 0x6c, 0x61, 0xe4, 0xf8, 0xb0, 0xa8, 0xc3, 0x7d, 0xa8, 0x25, + 0x9e, 0xe, 0x66, 0x0, 0xf7, 0x9c, 0xa5, 0xbc, 0xf4, 0x1f, 0x6, + 0xe3, 0x61, 0xe9, 0xb, 0xc4, 0xbd, 0xbf, 0x92, 0xc, 0x2e}, + {0x35, 0x5a, 0x80, 0x9b, 0x43, 0x9, 0x3f, 0xc, 0xfc, 0xab, 0x42, + 0x62, 0x37, 0x8b, 0x4e, 0xe8, 0x46, 0x93, 0x22, 0x5c, 0xf3, 0x17, + 0x14, 0x69, 0xec, 0xf0, 0x4e, 0x14, 0xbb, 0x9c, 0x9b, 0xe}, + }, + { + {0xee, 0xbe, 0xb1, 0x5d, 0xd5, 0x9b, 0xee, 0x8d, 0xb9, 0x3f, 0x72, + 0xa, 0x37, 0xab, 0xc3, 0xc9, 0x91, 0xd7, 0x68, 0x1c, 0xbf, 0xf1, + 0xa8, 0x44, 0xde, 0x3c, 0xfd, 0x1c, 0x19, 0x44, 0x6d, 0x36}, + {0xad, 0x20, 0x57, 0xfb, 0x8f, 0xd4, 0xba, 0xfb, 0xe, 0xd, 0xf9, + 0xdb, 0x6b, 0x91, 0x81, 0xee, 0xbf, 0x43, 0x55, 0x63, 0x52, 0x31, + 0x81, 0xd4, 0xd8, 0x7b, 0x33, 0x3f, 0xeb, 0x4, 0x11, 0x22}, + {0x14, 0x8c, 0xbc, 0xf2, 0x43, 0x17, 0x3c, 0x9e, 0x3b, 0x6c, 0x85, + 0xb5, 0xfc, 0x26, 0xda, 0x2e, 0x97, 0xfb, 0xa7, 0x68, 0xe, 0x2f, + 0xb8, 0xcc, 0x44, 0x32, 0x59, 0xbc, 0xe6, 0xa4, 0x67, 0x41}, + }, + { + {0xee, 0x8f, 0xce, 0xf8, 0x65, 0x26, 0xbe, 0xc2, 0x2c, 0xd6, 0x80, + 0xe8, 0x14, 0xff, 0x67, 0xe9, 0xee, 0x4e, 0x36, 0x2f, 0x7e, 0x6e, + 0x2e, 0xf1, 0xf6, 0xd2, 0x7e, 0xcb, 0x70, 0x33, 0xb3, 0x34}, + {0x0, 0x27, 0xf6, 0x76, 0x28, 0x9d, 0x3b, 0x64, 0xeb, 0x68, 0x76, + 0xe, 0x40, 0x9d, 0x1d, 0x5d, 0x84, 0x6, 0xfc, 0x21, 0x3, 0x43, + 0x4b, 0x1b, 0x6a, 0x24, 0x55, 0x22, 0x7e, 0xbb, 0x38, 0x79}, + {0xcc, 0xd6, 0x81, 0x86, 0xee, 0x91, 0xc5, 0xcd, 0x53, 0xa7, 0x85, + 0xed, 0x9c, 0x10, 0x2, 0xce, 0x83, 0x88, 0x80, 0x58, 0xc1, 0x85, + 0x74, 0xed, 0xe4, 0x65, 0xfe, 0x2d, 0x6e, 0xfc, 0x76, 0x11}, + }, + { + {0xb8, 0xe, 0x77, 0x49, 0x89, 0xe2, 0x90, 0xdb, 0xa3, 0x40, 0xf4, + 0xac, 0x2a, 0xcc, 0xfb, 0x98, 0x9b, 0x87, 0xd7, 0xde, 0xfe, 0x4f, + 0x35, 0x21, 0xb6, 0x6, 0x69, 0xf2, 0x54, 0x3e, 0x6a, 0x1f}, + {0x9b, 0x61, 0x9c, 0x5b, 0xd0, 0x6c, 0xaf, 0xb4, 0x80, 0x84, 0xa5, + 0xb2, 0xf4, 0xc9, 0xdf, 0x2d, 0xc4, 0x4d, 0xe9, 0xeb, 0x2, 0xa5, + 0x4f, 0x3d, 
0x34, 0x5f, 0x7d, 0x67, 0x4c, 0x3a, 0xfc, 0x8}, + {0xea, 0x34, 0x7, 0xd3, 0x99, 0xc1, 0xa4, 0x60, 0xd6, 0x5c, 0x16, + 0x31, 0xb6, 0x85, 0xc0, 0x40, 0x95, 0x82, 0x59, 0xf7, 0x23, 0x3e, + 0x33, 0xe2, 0xd1, 0x0, 0xb9, 0x16, 0x1, 0xad, 0x2f, 0x4f}, + }, + { + {0x38, 0xb6, 0x3b, 0xb7, 0x1d, 0xd9, 0x2c, 0x96, 0x8, 0x9c, 0x12, + 0xfc, 0xaa, 0x77, 0x5, 0xe6, 0x89, 0x16, 0xb6, 0xf3, 0x39, 0x9b, + 0x61, 0x6f, 0x81, 0xee, 0x44, 0x29, 0x5f, 0x99, 0x51, 0x34}, + {0x54, 0x4e, 0xae, 0x94, 0x41, 0xb2, 0xbe, 0x44, 0x6c, 0xef, 0x57, + 0x18, 0x51, 0x1c, 0x54, 0x5f, 0x98, 0x4, 0x8d, 0x36, 0x2d, 0x6b, + 0x1e, 0xa6, 0xab, 0xf7, 0x2e, 0x97, 0xa4, 0x84, 0x54, 0x44}, + {0x7c, 0x7d, 0xea, 0x9f, 0xd0, 0xfc, 0x52, 0x91, 0xf6, 0x5c, 0x93, + 0xb0, 0x94, 0x6c, 0x81, 0x4a, 0x40, 0x5c, 0x28, 0x47, 0xaa, 0x9a, + 0x8e, 0x25, 0xb7, 0x93, 0x28, 0x4, 0xa6, 0x9c, 0xb8, 0x10}, + }, + }, + { + { + {0x6e, 0xf0, 0x45, 0x5a, 0xbe, 0x41, 0x39, 0x75, 0x65, 0x5f, 0x9c, + 0x6d, 0xed, 0xae, 0x7c, 0xd0, 0xb6, 0x51, 0xff, 0x72, 0x9c, 0x6b, + 0x77, 0x11, 0xa9, 0x4d, 0xd, 0xef, 0xd9, 0xd1, 0xd2, 0x17}, + {0x9c, 0x28, 0x18, 0x97, 0x49, 0x47, 0x59, 0x3d, 0x26, 0x3f, 0x53, + 0x24, 0xc5, 0xf8, 0xeb, 0x12, 0x15, 0xef, 0xc3, 0x14, 0xcb, 0xbf, + 0x62, 0x2, 0x8e, 0x51, 0xb7, 0x77, 0xd5, 0x78, 0xb8, 0x20}, + {0x6a, 0x3e, 0x3f, 0x7, 0x18, 0xaf, 0xf2, 0x27, 0x69, 0x10, 0x52, + 0xd7, 0x19, 0xe5, 0x3f, 0xfd, 0x22, 0x0, 0xa6, 0x3c, 0x2c, 0xb7, + 0xe3, 0x22, 0xa7, 0xc6, 0x65, 0xcc, 0x63, 0x4f, 0x21, 0x72}, + }, + { + {0xc9, 0x29, 0x3b, 0xf4, 0xb9, 0xb7, 0x9d, 0x1d, 0x75, 0x8f, 0x51, + 0x4f, 0x4a, 0x82, 0x5, 0xd6, 0xc4, 0x9d, 0x2f, 0x31, 0xbd, 0x72, + 0xc0, 0xf2, 0xb0, 0x45, 0x15, 0x5a, 0x85, 0xac, 0x24, 0x1f}, + {0x93, 0xa6, 0x7, 0x53, 0x40, 0x7f, 0xe3, 0xb4, 0x95, 0x67, 0x33, + 0x2f, 0xd7, 0x14, 0xa7, 0xab, 0x99, 0x10, 0x76, 0x73, 0xa7, 0xd0, + 0xfb, 0xd6, 0xc9, 0xcb, 0x71, 0x81, 0xc5, 0x48, 0xdf, 0x5f}, + {0xaa, 0x5, 0x95, 0x8e, 0x32, 0x8, 0xd6, 0x24, 0xee, 0x20, 0x14, + 0xc, 0xd1, 0xc1, 0x48, 0x47, 0xa2, 0x25, 0xfb, 0x6, 0x5c, 0xe4, + 0xff, 0xc7, 0xe6, 0x95, 0xe3, 0x2a, 0x9e, 0x73, 0xba, 0x0}, + }, + { + {0x26, 0xbb, 0x88, 0xea, 0xf5, 0x26, 0x44, 0xae, 0xfb, 0x3b, 0x97, + 0x84, 0xd9, 0x79, 0x6, 0x36, 0x50, 0x4e, 0x69, 0x26, 0xc, 0x3, + 0x9f, 0x5c, 0x26, 0xd2, 0x18, 0xd5, 0xe7, 0x7d, 0x29, 0x72}, + {0xd6, 0x90, 0x87, 0x5c, 0xde, 0x98, 0x2e, 0x59, 0xdf, 0xa2, 0xc2, + 0x45, 0xd3, 0xb7, 0xbf, 0xe5, 0x22, 0x99, 0xb4, 0xf9, 0x60, 0x3b, + 0x5a, 0x11, 0xf3, 0x78, 0xad, 0x67, 0x3e, 0x3a, 0x28, 0x3}, + {0x39, 0xb9, 0xc, 0xbe, 0xc7, 0x1d, 0x24, 0x48, 0x80, 0x30, 0x63, + 0x8b, 0x4d, 0x9b, 0xf1, 0x32, 0x8, 0x93, 0x28, 0x2, 0xd, 0xc9, + 0xdf, 0xd3, 0x45, 0x19, 0x27, 0x46, 0x68, 0x29, 0xe1, 0x5}, + }, + { + {0x50, 0x45, 0x2c, 0x24, 0xc8, 0xbb, 0xbf, 0xad, 0xd9, 0x81, 0x30, + 0xd0, 0xec, 0xc, 0xc8, 0xbc, 0x92, 0xdf, 0xc8, 0xf5, 0xa6, 0x66, + 0x35, 0x84, 0x4c, 0xce, 0x58, 0x82, 0xd3, 0x25, 0xcf, 0x78}, + {0x5a, 0x49, 0x9c, 0x2d, 0xb3, 0xee, 0x82, 0xba, 0x7c, 0xb9, 0x2b, + 0xf1, 0xfc, 0xc8, 0xef, 0xce, 0xe0, 0xd1, 0xb5, 0x93, 0xae, 0xab, + 0x2d, 0xb0, 0x9b, 0x8d, 0x69, 0x13, 0x9c, 0xc, 0xc0, 0x39}, + {0x68, 0x9d, 0x48, 0x31, 0x8e, 0x6b, 0xae, 0x15, 0x87, 0xf0, 0x2b, + 0x9c, 0xab, 0x1c, 0x85, 0xaa, 0x5, 0xfa, 0x4e, 0xf0, 0x97, 0x5a, + 0xa7, 0xc9, 0x32, 0xf8, 0x3f, 0x6b, 0x7, 0x52, 0x6b, 0x0}, + }, + { + {0x2d, 0x8, 0xce, 0xb9, 0x16, 0x7e, 0xcb, 0xf5, 0x29, 0xbc, 0x7a, + 0x41, 0x4c, 0xf1, 0x7, 0x34, 0xab, 0xa7, 0xf4, 0x2b, 0xce, 0x6b, + 0xb3, 0xd4, 0xce, 0x75, 0x9f, 0x1a, 0x56, 0xe9, 0xe2, 0x7d}, + {0x1c, 0x78, 0x95, 0x9d, 0xe1, 0xcf, 0xe0, 0x29, 0xe2, 0x10, 0x63, + 0x96, 
0x18, 0xdf, 0x81, 0xb6, 0x39, 0x6b, 0x51, 0x70, 0xd3, 0x39, + 0xdf, 0x57, 0x22, 0x61, 0xc7, 0x3b, 0x44, 0xe3, 0x57, 0x4d}, + {0xcb, 0x5e, 0xa5, 0xb6, 0xf4, 0xd4, 0x70, 0xde, 0x99, 0xdb, 0x85, + 0x5d, 0x7f, 0x52, 0x1, 0x48, 0x81, 0x9a, 0xee, 0xd3, 0x40, 0xc4, + 0xc9, 0xdb, 0xed, 0x29, 0x60, 0x1a, 0xaf, 0x90, 0x2a, 0x6b}, + }, + { + {0xa, 0xd8, 0xb2, 0x5b, 0x24, 0xf3, 0xeb, 0x77, 0x9b, 0x7, 0xb9, + 0x2f, 0x47, 0x1b, 0x30, 0xd8, 0x33, 0x73, 0xee, 0x4c, 0xf2, 0xe6, + 0x47, 0xc6, 0x9, 0x21, 0x6c, 0x27, 0xc8, 0x12, 0x58, 0x46}, + {0x97, 0x1e, 0xe6, 0x9a, 0xfc, 0xf4, 0x23, 0x69, 0xd1, 0x5f, 0x3f, + 0xe0, 0x1d, 0x28, 0x35, 0x57, 0x2d, 0xd1, 0xed, 0xe6, 0x43, 0xae, + 0x64, 0xa7, 0x4a, 0x3e, 0x2d, 0xd1, 0xe9, 0xf4, 0xd8, 0x5f}, + {0xd9, 0x62, 0x10, 0x2a, 0xb2, 0xbe, 0x43, 0x4d, 0x16, 0xdc, 0x31, + 0x38, 0x75, 0xfb, 0x65, 0x70, 0xd7, 0x68, 0x29, 0xde, 0x7b, 0x4a, + 0xd, 0x18, 0x90, 0x67, 0xb1, 0x1c, 0x2b, 0x2c, 0xb3, 0x5}, + }, + { + {0x95, 0x81, 0xd5, 0x7a, 0x2c, 0xa4, 0xfc, 0xf7, 0xcc, 0xf3, 0x33, + 0x43, 0x6e, 0x28, 0x14, 0x32, 0x9d, 0x97, 0xb, 0x34, 0xd, 0x9d, + 0xc2, 0xb6, 0xe1, 0x7, 0x73, 0x56, 0x48, 0x1a, 0x77, 0x31}, + {0xfd, 0xa8, 0x4d, 0xd2, 0xcc, 0x5e, 0xc0, 0xc8, 0x83, 0xef, 0xdf, + 0x5, 0xac, 0x1a, 0xcf, 0xa1, 0x61, 0xcd, 0xf9, 0x7d, 0xf2, 0xef, + 0xbe, 0xdb, 0x99, 0x1e, 0x47, 0x7b, 0xa3, 0x56, 0x55, 0x3b}, + {0x82, 0xd4, 0x4d, 0xe1, 0x24, 0xc5, 0xb0, 0x32, 0xb6, 0xa4, 0x2b, + 0x1a, 0x54, 0x51, 0xb3, 0xed, 0xf3, 0x5a, 0x2b, 0x28, 0x48, 0x60, + 0xd1, 0xa3, 0xeb, 0x36, 0x73, 0x7a, 0xd2, 0x79, 0xc0, 0x4f}, + }, + { + {0xd, 0xc5, 0x86, 0xc, 0x44, 0x8b, 0x34, 0xdc, 0x51, 0xe6, 0x94, + 0xcc, 0xc9, 0xcb, 0x37, 0x13, 0xb9, 0x3c, 0x3e, 0x64, 0x4d, 0xf7, + 0x22, 0x64, 0x8, 0xcd, 0xe3, 0xba, 0xc2, 0x70, 0x11, 0x24}, + {0x7f, 0x2f, 0xbf, 0x89, 0xb0, 0x38, 0xc9, 0x51, 0xa7, 0xe9, 0xdf, + 0x2, 0x65, 0xbd, 0x97, 0x24, 0x53, 0xe4, 0x80, 0x78, 0x9c, 0xc0, + 0xff, 0xff, 0x92, 0x8e, 0xf9, 0xca, 0xce, 0x67, 0x45, 0x12}, + {0xb4, 0x73, 0xc4, 0xa, 0x86, 0xab, 0xf9, 0x3f, 0x35, 0xe4, 0x13, + 0x1, 0xee, 0x1d, 0x91, 0xf0, 0xaf, 0xc4, 0xc6, 0xeb, 0x60, 0x50, + 0xe7, 0x4a, 0xd, 0x0, 0x87, 0x6c, 0x96, 0x12, 0x86, 0x3f}, + }, + }, + { + { + {0x13, 0x8d, 0x4, 0x36, 0xfa, 0xfc, 0x18, 0x9c, 0xdd, 0x9d, 0x89, + 0x73, 0xb3, 0x9d, 0x15, 0x29, 0xaa, 0xd0, 0x92, 0x9f, 0xb, 0x35, + 0x9f, 0xdc, 0xd4, 0x19, 0x8a, 0x87, 0xee, 0x7e, 0xf5, 0x26}, + {0xde, 0xd, 0x2a, 0x78, 0xc9, 0xc, 0x9a, 0x55, 0x85, 0x83, 0x71, + 0xea, 0xb2, 0xcd, 0x1d, 0x55, 0x8c, 0x23, 0xef, 0x31, 0x5b, 0x86, + 0x62, 0x7f, 0x3d, 0x61, 0x73, 0x79, 0x76, 0xa7, 0x4a, 0x50}, + {0xb1, 0xef, 0x87, 0x56, 0xd5, 0x2c, 0xab, 0xc, 0x7b, 0xf1, 0x7a, + 0x24, 0x62, 0xd1, 0x80, 0x51, 0x67, 0x24, 0x5a, 0x4f, 0x34, 0x5a, + 0xc1, 0x85, 0x69, 0x30, 0xba, 0x9d, 0x3d, 0x94, 0x41, 0x40}, + }, + { + {0xdd, 0xaa, 0x6c, 0xa2, 0x43, 0x77, 0x21, 0x4b, 0xce, 0xb7, 0x8a, + 0x64, 0x24, 0xb4, 0xa6, 0x47, 0xe3, 0xc9, 0xfb, 0x3, 0x7a, 0x4f, + 0x1d, 0xcb, 0x19, 0xd0, 0x0, 0x98, 0x42, 0x31, 0xd9, 0x12}, + {0x96, 0xcc, 0xeb, 0x43, 0xba, 0xee, 0xc0, 0xc3, 0xaf, 0x9c, 0xea, + 0x26, 0x9c, 0x9c, 0x74, 0x8d, 0xc6, 0xcc, 0x77, 0x1c, 0xee, 0x95, + 0xfa, 0xd9, 0xf, 0x34, 0x84, 0x76, 0xd9, 0xa1, 0x20, 0x14}, + {0x4f, 0x59, 0x37, 0xd3, 0x99, 0x77, 0xc6, 0x0, 0x7b, 0xa4, 0x3a, + 0xb2, 0x40, 0x51, 0x3c, 0x5e, 0x95, 0xf3, 0x5f, 0xe3, 0x54, 0x28, + 0x18, 0x44, 0x12, 0xa0, 0x59, 0x43, 0x31, 0x92, 0x4f, 0x1b}, + }, + { + {0xb1, 0x66, 0x98, 0xa4, 0x30, 0x30, 0xcf, 0x33, 0x59, 0x48, 0x5f, + 0x21, 0xd2, 0x73, 0x1f, 0x25, 0xf6, 0xf4, 0xde, 0x51, 0x40, 0xaa, + 0x82, 0xab, 0xf6, 0x23, 0x9a, 0x6f, 0xd5, 0x91, 0xf1, 
0x5f}, + {0x51, 0x9, 0x15, 0x89, 0x9d, 0x10, 0x5c, 0x3e, 0x6a, 0x69, 0xe9, + 0x2d, 0x91, 0xfa, 0xce, 0x39, 0x20, 0x30, 0x5f, 0x97, 0x3f, 0xe4, + 0xea, 0x20, 0xae, 0x2d, 0x13, 0x7f, 0x2a, 0x57, 0x9b, 0x23}, + {0x68, 0x90, 0x2d, 0xac, 0x33, 0xd4, 0x9e, 0x81, 0x23, 0x85, 0xc9, + 0x5f, 0x79, 0xab, 0x83, 0x28, 0x3d, 0xeb, 0x93, 0x55, 0x80, 0x72, + 0x45, 0xef, 0xcb, 0x36, 0x8f, 0x75, 0x6a, 0x52, 0xc, 0x2}, + }, + { + {0x89, 0xcc, 0x42, 0xf0, 0x59, 0xef, 0x31, 0xe9, 0xb6, 0x4b, 0x12, + 0x8e, 0x9d, 0x9c, 0x58, 0x2c, 0x97, 0x59, 0xc7, 0xae, 0x8a, 0xe1, + 0xc8, 0xad, 0xc, 0xc5, 0x2, 0x56, 0xa, 0xfe, 0x2c, 0x45}, + {0xbc, 0xdb, 0xd8, 0x9e, 0xf8, 0x34, 0x98, 0x77, 0x6c, 0xa4, 0x7c, + 0xdc, 0xf9, 0xaa, 0xf2, 0xc8, 0x74, 0xb0, 0xe1, 0xa3, 0xdc, 0x4c, + 0x52, 0xa9, 0x77, 0x38, 0x31, 0x15, 0x46, 0xcc, 0xaa, 0x2}, + {0xdf, 0x77, 0x78, 0x64, 0xa0, 0xf7, 0xa0, 0x86, 0x9f, 0x7c, 0x60, + 0xe, 0x27, 0x64, 0xc4, 0xbb, 0xc9, 0x11, 0xfb, 0xf1, 0x25, 0xea, + 0x17, 0xab, 0x7b, 0x87, 0x4b, 0x30, 0x7b, 0x7d, 0xfb, 0x4c}, + }, + { + {0x12, 0xef, 0x89, 0x97, 0xc2, 0x99, 0x86, 0xe2, 0xd, 0x19, 0x57, + 0xdf, 0x71, 0xcd, 0x6e, 0x2b, 0xd0, 0x70, 0xc9, 0xec, 0x57, 0xc8, + 0x43, 0xc3, 0xc5, 0x3a, 0x4d, 0x43, 0xbc, 0x4c, 0x1d, 0x5b}, + {0xfe, 0x75, 0x9b, 0xb8, 0x6c, 0x3d, 0xb4, 0x72, 0x80, 0xdc, 0x6a, + 0x9c, 0xd9, 0x94, 0xc6, 0x54, 0x9f, 0x4c, 0xe3, 0x3e, 0x37, 0xaa, + 0xc3, 0xb8, 0x64, 0x53, 0x7, 0x39, 0x2b, 0x62, 0xb4, 0x14}, + {0x26, 0x9f, 0xa, 0xcc, 0x15, 0x26, 0xfb, 0xb6, 0xe5, 0xcc, 0x8d, + 0xb8, 0x2b, 0xe, 0x4f, 0x3a, 0x5, 0xa7, 0x69, 0x33, 0x8b, 0x49, + 0x1, 0x13, 0xd1, 0x2d, 0x59, 0x58, 0x12, 0xf7, 0x98, 0x2f}, + }, + { + {0x1, 0xa7, 0x54, 0x4f, 0x44, 0xae, 0x12, 0x2e, 0xde, 0xd7, 0xcb, + 0xa9, 0xf0, 0x3e, 0xfe, 0xfc, 0xe0, 0x5d, 0x83, 0x75, 0xd, 0x89, + 0xbf, 0xce, 0x54, 0x45, 0x61, 0xe7, 0xe9, 0x62, 0x80, 0x1d}, + {0x56, 0x9e, 0xf, 0xb5, 0x4c, 0xa7, 0x94, 0xc, 0x20, 0x13, 0x8e, + 0x8e, 0xa9, 0xf4, 0x1f, 0x5b, 0x67, 0xf, 0x30, 0x82, 0x21, 0xcc, + 0x2a, 0x9a, 0xf9, 0xaa, 0x6, 0xd8, 0x49, 0xe2, 0x6a, 0x3a}, + {0x5a, 0x7c, 0x90, 0xa9, 0x85, 0xda, 0x7a, 0x65, 0x62, 0xf, 0xb9, + 0x91, 0xb5, 0xa8, 0xe, 0x1a, 0xe9, 0xb4, 0x34, 0xdf, 0xfb, 0x1d, + 0xe, 0x8d, 0xf3, 0x5f, 0xf2, 0xae, 0xe8, 0x8c, 0x8b, 0x29}, + }, + { + {0xde, 0x65, 0x21, 0xa, 0xea, 0x72, 0x7a, 0x83, 0xf6, 0x79, 0xcf, + 0xb, 0xb4, 0x7, 0xab, 0x3f, 0x70, 0xae, 0x38, 0x77, 0xc7, 0x36, + 0x16, 0x52, 0xdc, 0xd7, 0xa7, 0x3, 0x18, 0x27, 0xa6, 0x6b}, + {0xb2, 0xc, 0xf7, 0xef, 0x53, 0x79, 0x92, 0x2a, 0x76, 0x70, 0x15, + 0x79, 0x2a, 0xc9, 0x89, 0x4b, 0x6a, 0xcf, 0xa7, 0x30, 0x7a, 0x45, + 0x18, 0x94, 0x85, 0xe4, 0x5c, 0x4d, 0x40, 0xa8, 0xb8, 0x34}, + {0x35, 0x33, 0x69, 0x83, 0xb5, 0xec, 0x6e, 0xc2, 0xfd, 0xfe, 0xb5, + 0x63, 0xdf, 0x13, 0xa8, 0xd5, 0x73, 0x25, 0xb2, 0xa4, 0x9a, 0xaa, + 0x93, 0xa2, 0x6a, 0x1c, 0x5e, 0x46, 0xdd, 0x2b, 0xd6, 0x71}, + }, + { + {0xf5, 0x5e, 0xf7, 0xb1, 0xda, 0xb5, 0x2d, 0xcd, 0xf5, 0x65, 0xb0, + 0x16, 0xcf, 0x95, 0x7f, 0xd7, 0x85, 0xf0, 0x49, 0x3f, 0xea, 0x1f, + 0x57, 0x14, 0x3d, 0x2b, 0x2b, 0x26, 0x21, 0x36, 0x33, 0x1c}, + {0x80, 0xdf, 0x78, 0xd3, 0x28, 0xcc, 0x33, 0x65, 0xb4, 0xa4, 0xf, + 0xa, 0x79, 0x43, 0xdb, 0xf6, 0x5a, 0xda, 0x1, 0xf7, 0xf9, 0x5f, + 0x64, 0xe3, 0xa4, 0x2b, 0x17, 0xf3, 0x17, 0xf3, 0xd5, 0x74}, + {0x81, 0xca, 0xd9, 0x67, 0x54, 0xe5, 0x6f, 0xa8, 0x37, 0x8c, 0x29, + 0x2b, 0x75, 0x7c, 0x8b, 0x39, 0x3b, 0x62, 0xac, 0xe3, 0x92, 0x8, + 0x6d, 0xda, 0x8c, 0xd9, 0xe9, 0x47, 0x45, 0xcc, 0xeb, 0x4a}, + }, + }, + { + { + {0x10, 0xb6, 0x54, 0x73, 0x9e, 0x8d, 0x40, 0xb, 0x6e, 0x5b, 0xa8, + 0x5b, 0x53, 0x32, 0x6b, 0x80, 0x7, 0xa2, 0x58, 
0x4a, 0x3, 0x3a, + 0xe6, 0xdb, 0x2c, 0xdf, 0xa1, 0xc9, 0xdd, 0xd9, 0x3b, 0x17}, + {0xc9, 0x1, 0x6d, 0x27, 0x1b, 0x7, 0xf0, 0x12, 0x70, 0x8c, 0xc4, + 0x86, 0xc5, 0xba, 0xb8, 0xe7, 0xa9, 0xfb, 0xd6, 0x71, 0x9b, 0x12, + 0x8, 0x53, 0x92, 0xb7, 0x3d, 0x5a, 0xf9, 0xfb, 0x88, 0x5d}, + {0xdf, 0x72, 0x58, 0xfe, 0x1e, 0xf, 0x50, 0x2b, 0xc1, 0x18, 0x39, + 0xd4, 0x2e, 0x58, 0xd6, 0x58, 0xe0, 0x3a, 0x67, 0xc9, 0x8e, 0x27, + 0xed, 0xe6, 0x19, 0xa3, 0x9e, 0xb1, 0x13, 0xcd, 0xe1, 0x6}, + }, + { + {0x53, 0x3, 0x5b, 0x9e, 0x62, 0xaf, 0x2b, 0x47, 0x47, 0x4, 0x8d, + 0x27, 0x90, 0xb, 0xaa, 0x3b, 0x27, 0xbf, 0x43, 0x96, 0x46, 0x5f, + 0x78, 0xc, 0x13, 0x7b, 0x83, 0x8d, 0x1a, 0x6a, 0x3a, 0x7f}, + {0x23, 0x6f, 0x16, 0x6f, 0x51, 0xad, 0xd0, 0x40, 0xbe, 0x6a, 0xab, + 0x1f, 0x93, 0x32, 0x8e, 0x11, 0x8e, 0x8, 0x4d, 0xa0, 0x14, 0x5e, + 0xe3, 0x3f, 0x66, 0x62, 0xe1, 0x26, 0x35, 0x60, 0x80, 0x30}, + {0xb, 0x80, 0x3d, 0x5d, 0x39, 0x44, 0xe6, 0xf7, 0xf6, 0xed, 0x1, + 0xc9, 0x55, 0xd5, 0xa8, 0x95, 0x39, 0x63, 0x2c, 0x59, 0x30, 0x78, + 0xcd, 0x68, 0x7e, 0x30, 0x51, 0x2e, 0xed, 0xfd, 0xd0, 0x30}, + }, + { + {0x50, 0x47, 0xb8, 0x68, 0x1e, 0x97, 0xb4, 0x9c, 0xcf, 0xbb, 0x64, + 0x66, 0x29, 0x72, 0x95, 0xa0, 0x2b, 0x41, 0xfa, 0x72, 0x26, 0xe7, + 0x8d, 0x5c, 0xd9, 0x89, 0xc5, 0x51, 0x43, 0x8, 0x15, 0x46}, + {0xb3, 0x33, 0x12, 0xf2, 0x1a, 0x4d, 0x59, 0xe0, 0x9c, 0x4d, 0xcc, + 0xf0, 0x8e, 0xe7, 0xdb, 0x1b, 0x77, 0x9a, 0x49, 0x8f, 0x7f, 0x18, + 0x65, 0x69, 0x68, 0x98, 0x9, 0x2c, 0x20, 0x14, 0x92, 0xa}, + {0x2e, 0xa0, 0xb9, 0xae, 0xc0, 0x19, 0x90, 0xbc, 0xae, 0x4c, 0x3, + 0x16, 0xd, 0x11, 0xc7, 0x55, 0xec, 0x32, 0x99, 0x65, 0x1, 0xf5, + 0x6d, 0xe, 0xfe, 0x5d, 0xca, 0x95, 0x28, 0xd, 0xca, 0x3b}, + }, + { + {0xbf, 0x1, 0xcc, 0x9e, 0xb6, 0x8e, 0x68, 0x9c, 0x6f, 0x89, 0x44, + 0xa6, 0xad, 0x83, 0xbc, 0xf0, 0xe2, 0x9f, 0x7a, 0x5f, 0x5f, 0x95, + 0x2d, 0xca, 0x41, 0x82, 0xf2, 0x8d, 0x3, 0xb4, 0xa8, 0x4e}, + {0xa4, 0x62, 0x5d, 0x3c, 0xbc, 0x31, 0xf0, 0x40, 0x60, 0x7a, 0xf0, + 0xcf, 0x3e, 0x8b, 0xfc, 0x19, 0x45, 0xb5, 0xf, 0x13, 0xa2, 0x3d, + 0x18, 0x98, 0xcd, 0x13, 0x8f, 0xae, 0xdd, 0xde, 0x31, 0x56}, + {0x2, 0xd2, 0xca, 0xf1, 0xa, 0x46, 0xed, 0x2a, 0x83, 0xee, 0x8c, + 0xa4, 0x5, 0x53, 0x30, 0x46, 0x5f, 0x1a, 0xf1, 0x49, 0x45, 0x77, + 0x21, 0x91, 0x63, 0xa4, 0x2c, 0x54, 0x30, 0x9, 0xce, 0x24}, + }, + { + {0x85, 0xb, 0xf3, 0xfd, 0x55, 0xa1, 0xcf, 0x3f, 0xa4, 0x2e, 0x37, + 0x36, 0x8e, 0x16, 0xf7, 0xd2, 0x44, 0xf8, 0x92, 0x64, 0xde, 0x64, + 0xe0, 0xb2, 0x80, 0x42, 0x4f, 0x32, 0xa7, 0x28, 0x99, 0x54}, + {0x6, 0xc1, 0x6, 0xfd, 0xf5, 0x90, 0xe8, 0x1f, 0xf2, 0x10, 0x88, + 0x5d, 0x35, 0x68, 0xc4, 0xb5, 0x3e, 0xaf, 0x8c, 0x6e, 0xfe, 0x8, + 0x78, 0x82, 0x4b, 0xd7, 0x6, 0x8a, 0xc2, 0xe3, 0xd4, 0x41}, + {0x2e, 0x1a, 0xee, 0x63, 0xa7, 0x32, 0x6e, 0xf2, 0xea, 0xfd, 0x5f, + 0xd2, 0xb7, 0xe4, 0x91, 0xae, 0x69, 0x4d, 0x7f, 0xd1, 0x3b, 0xd3, + 0x3b, 0xbc, 0x6a, 0xff, 0xdc, 0xc0, 0xde, 0x66, 0x1b, 0x49}, + }, + { + {0xa1, 0x64, 0xda, 0xd0, 0x8e, 0x4a, 0xf0, 0x75, 0x4b, 0x28, 0xe2, + 0x67, 0xaf, 0x2c, 0x22, 0xed, 0xa4, 0x7b, 0x7b, 0x1f, 0x79, 0xa3, + 0x34, 0x82, 0x67, 0x8b, 0x1, 0xb7, 0xb0, 0xb8, 0xf6, 0x4c}, + {0xa7, 0x32, 0xea, 0xc7, 0x3d, 0xb1, 0xf5, 0x98, 0x98, 0xdb, 0x16, + 0x7e, 0xcc, 0xf8, 0xd5, 0xe3, 0x47, 0xd9, 0xf8, 0xcb, 0x52, 0xbf, + 0xa, 0xac, 0xac, 0xe4, 0x5e, 0xc8, 0xd0, 0x38, 0xf3, 0x8}, + {0xbd, 0x73, 0x1a, 0x99, 0x21, 0xa8, 0x83, 0xc3, 0x7a, 0xc, 0x32, + 0xdf, 0x1, 0xbc, 0x27, 0xab, 0x63, 0x70, 0x77, 0x84, 0x1b, 0x33, + 0x3d, 0xc1, 0x99, 0x8a, 0x7, 0xeb, 0x82, 0x4a, 0xd, 0x53}, + }, + { + {0x9e, 0xbf, 0x9a, 0x6c, 0x45, 0x73, 0x69, 0x6d, 0x80, 
0xa8, 0x0, + 0x49, 0xfc, 0xb2, 0x7f, 0x25, 0x50, 0xb8, 0xcf, 0xc8, 0x12, 0xf4, + 0xac, 0x2b, 0x5b, 0xbd, 0xbf, 0xc, 0xe0, 0xe7, 0xb3, 0xd}, + {0x25, 0x48, 0xf9, 0xe1, 0x30, 0x36, 0x4c, 0x0, 0x5a, 0x53, 0xab, + 0x8c, 0x26, 0x78, 0x2d, 0x7e, 0x8b, 0xff, 0x84, 0xcc, 0x23, 0x23, + 0x48, 0xc7, 0xb9, 0x70, 0x17, 0x10, 0x3f, 0x75, 0xea, 0x65}, + {0x63, 0x63, 0x9, 0xe2, 0x3e, 0xfc, 0x66, 0x3d, 0x6b, 0xcb, 0xb5, + 0x61, 0x7f, 0x2c, 0xd6, 0x81, 0x1a, 0x3b, 0x44, 0x13, 0x42, 0x4, + 0xbe, 0xf, 0xdb, 0xa1, 0xe1, 0x21, 0x19, 0xec, 0xa4, 0x2}, + }, + { + {0x5f, 0x79, 0xcf, 0xf1, 0x62, 0x61, 0xc8, 0xf5, 0xf2, 0x57, 0xee, + 0x26, 0x19, 0x86, 0x8c, 0x11, 0x78, 0x35, 0x6, 0x1c, 0x85, 0x24, + 0x21, 0x17, 0xcf, 0x7f, 0x6, 0xec, 0x5d, 0x2b, 0xd1, 0x36}, + {0xa2, 0xb8, 0x24, 0x3b, 0x9a, 0x25, 0xe6, 0x5c, 0xb8, 0xa0, 0xaf, + 0x45, 0xcc, 0x7a, 0x57, 0xb8, 0x37, 0x70, 0xa0, 0x8b, 0xe8, 0xe6, + 0xcb, 0xcc, 0xbf, 0x9, 0x78, 0x12, 0x51, 0x3c, 0x14, 0x3d}, + {0x57, 0x45, 0x15, 0x79, 0x91, 0x27, 0x6d, 0x12, 0xa, 0x3a, 0x78, + 0xfc, 0x5c, 0x8f, 0xe4, 0xd5, 0xac, 0x9b, 0x17, 0xdf, 0xe8, 0xb6, + 0xbd, 0x36, 0x59, 0x28, 0xa8, 0x5b, 0x88, 0x17, 0xf5, 0x2e}, + }, + }, + { + { + {0x51, 0x2f, 0x5b, 0x30, 0xfb, 0xbf, 0xee, 0x96, 0xb8, 0x96, 0x95, + 0x88, 0xad, 0x38, 0xf9, 0xd3, 0x25, 0xdd, 0xd5, 0x46, 0xc7, 0x2d, + 0xf5, 0xf0, 0x95, 0x0, 0x3a, 0xbb, 0x90, 0x82, 0x96, 0x57}, + {0xdc, 0xae, 0x58, 0x8c, 0x4e, 0x97, 0x37, 0x46, 0xa4, 0x41, 0xf0, + 0xab, 0xfb, 0x22, 0xef, 0xb9, 0x8a, 0x71, 0x80, 0xe9, 0x56, 0xd9, + 0x85, 0xe1, 0xa6, 0xa8, 0x43, 0xb1, 0xfa, 0x78, 0x1b, 0x2f}, + {0x1, 0xe1, 0x20, 0xa, 0x43, 0xb8, 0x1a, 0xf7, 0x47, 0xec, 0xf0, + 0x24, 0x8d, 0x65, 0x93, 0xf3, 0xd1, 0xee, 0xe2, 0x6e, 0xa8, 0x9, + 0x75, 0xcf, 0xe1, 0xa3, 0x2a, 0xdc, 0x35, 0x3e, 0xc4, 0x7d}, + }, + { + {0x18, 0x97, 0x3e, 0x27, 0x5c, 0x2a, 0x78, 0x5a, 0x94, 0xfd, 0x4e, + 0x5e, 0x99, 0xc6, 0x76, 0x35, 0x3e, 0x7d, 0x23, 0x1f, 0x5, 0xd8, + 0x2e, 0xf, 0x99, 0xa, 0xd5, 0x82, 0x1d, 0xb8, 0x4f, 0x4}, + {0xc3, 0xd9, 0x7d, 0x88, 0x65, 0x66, 0x96, 0x85, 0x55, 0x53, 0xb0, + 0x4b, 0x31, 0x9b, 0xf, 0xc9, 0xb1, 0x79, 0x20, 0xef, 0xf8, 0x8d, + 0xe0, 0xc6, 0x2f, 0xc1, 0x8c, 0x75, 0x16, 0x20, 0xf7, 0x7e}, + {0xd9, 0xe3, 0x7, 0xa9, 0xc5, 0x18, 0xdf, 0xc1, 0x59, 0x63, 0x4c, + 0xce, 0x1d, 0x37, 0xb3, 0x57, 0x49, 0xbb, 0x1, 0xb2, 0x34, 0x45, + 0x70, 0xca, 0x2e, 0xdd, 0x30, 0x9c, 0x3f, 0x82, 0x79, 0x7f}, + }, + { + {0xba, 0x87, 0xf5, 0x68, 0xf0, 0x1f, 0x9c, 0x6a, 0xde, 0xc8, 0x50, + 0x0, 0x4e, 0x89, 0x27, 0x8, 0xe7, 0x5b, 0xed, 0x7d, 0x55, 0x99, + 0xbf, 0x3c, 0xf0, 0xd6, 0x6, 0x1c, 0x43, 0xb0, 0xa9, 0x64}, + {0xe8, 0x13, 0xb5, 0xa3, 0x39, 0xd2, 0x34, 0x83, 0xd8, 0xa8, 0x1f, + 0xb9, 0xd4, 0x70, 0x36, 0xc1, 0x33, 0xbd, 0x90, 0xf5, 0x36, 0x41, + 0xb5, 0x12, 0xb4, 0xd9, 0x84, 0xd7, 0x73, 0x3, 0x4e, 0xa}, + {0x19, 0x29, 0x7d, 0x5b, 0xa1, 0xd6, 0xb3, 0x2e, 0x35, 0x82, 0x3a, + 0xd5, 0xa0, 0xf6, 0xb4, 0xb0, 0x47, 0x5d, 0xa4, 0x89, 0x43, 0xce, + 0x56, 0x71, 0x6c, 0x34, 0x18, 0xce, 0xa, 0x7d, 0x1a, 0x7}, + }, + { + {0x31, 0x44, 0xe1, 0x20, 0x52, 0x35, 0xc, 0xcc, 0x41, 0x51, 0xb1, + 0x9, 0x7, 0x95, 0x65, 0xd, 0x36, 0x5f, 0x9d, 0x20, 0x1b, 0x62, + 0xf5, 0x9a, 0xd3, 0x55, 0x77, 0x61, 0xf7, 0xbc, 0x69, 0x7c}, + {0xb, 0xba, 0x87, 0xc8, 0xaa, 0x2d, 0x7, 0xd3, 0xee, 0x62, 0xa5, + 0xbf, 0x5, 0x29, 0x26, 0x1, 0x8b, 0x76, 0xef, 0xc0, 0x2, 0x30, + 0x54, 0xcf, 0x9c, 0x7e, 0xea, 0x46, 0x71, 0xcc, 0x3b, 0x2c}, + {0x5f, 0x29, 0xe8, 0x4, 0xeb, 0xd7, 0xf0, 0x7, 0x7d, 0xf3, 0x50, + 0x2f, 0x25, 0x18, 0xdb, 0x10, 0xd7, 0x98, 0x17, 0x17, 0xa3, 0xa9, + 0x51, 0xe9, 0x1d, 0xa5, 0xac, 0x22, 0x73, 0x9a, 0x5a, 
0x6f}, + }, + { + {0xbe, 0x44, 0xd9, 0xa3, 0xeb, 0xd4, 0x29, 0xe7, 0x9e, 0xaf, 0x78, + 0x80, 0x40, 0x9, 0x9e, 0x8d, 0x3, 0x9c, 0x86, 0x47, 0x7a, 0x56, + 0x25, 0x45, 0x24, 0x3b, 0x8d, 0xee, 0x80, 0x96, 0xab, 0x2}, + {0xc5, 0xc6, 0x41, 0x2f, 0xc, 0x0, 0xa1, 0x8b, 0x9b, 0xfb, 0xfe, + 0xc, 0xc1, 0x79, 0x9f, 0xc4, 0x9f, 0x1c, 0xc5, 0x3c, 0x70, 0x47, + 0xfa, 0x4e, 0xca, 0xaf, 0x47, 0xe1, 0xa2, 0x21, 0x4e, 0x49}, + {0x9a, 0xd, 0xe5, 0xdd, 0x85, 0x8a, 0xa4, 0xef, 0x49, 0xa2, 0xb9, + 0xf, 0x4e, 0x22, 0x9a, 0x21, 0xd9, 0xf6, 0x1e, 0xd9, 0x1d, 0x1f, + 0x9, 0xfa, 0x34, 0xbb, 0x46, 0xea, 0xcb, 0x76, 0x5d, 0x6b}, + }, + { + {0x22, 0x25, 0x78, 0x1e, 0x17, 0x41, 0xf9, 0xe0, 0xd3, 0x36, 0x69, + 0x3, 0x74, 0xae, 0xe6, 0xf1, 0x46, 0xc7, 0xfc, 0xd0, 0xa2, 0x3e, + 0x8b, 0x40, 0x3e, 0x31, 0xdd, 0x3, 0x9c, 0x86, 0xfb, 0x16}, + {0x94, 0xd9, 0xc, 0xec, 0x6c, 0x55, 0x57, 0x88, 0xba, 0x1d, 0xd0, + 0x5c, 0x6f, 0xdc, 0x72, 0x64, 0x77, 0xb4, 0x42, 0x8f, 0x14, 0x69, + 0x1, 0xaf, 0x54, 0x73, 0x27, 0x85, 0xf6, 0x33, 0xe3, 0xa}, + {0x62, 0x9, 0xb6, 0x33, 0x97, 0x19, 0x8e, 0x28, 0x33, 0xe1, 0xab, + 0xd8, 0xb4, 0x72, 0xfc, 0x24, 0x3e, 0xd0, 0x91, 0x9, 0xed, 0xf7, + 0x11, 0x48, 0x75, 0xd0, 0x70, 0x8f, 0x8b, 0xe3, 0x81, 0x3f}, + }, + { + {0x24, 0xc8, 0x17, 0x5f, 0x35, 0x7f, 0xdb, 0xa, 0xa4, 0x99, 0x42, + 0xd7, 0xc3, 0x23, 0xb9, 0x74, 0xf7, 0xea, 0xf8, 0xcb, 0x8b, 0x3e, + 0x7c, 0xd5, 0x3d, 0xdc, 0xde, 0x4c, 0xd3, 0xe2, 0xd3, 0xa}, + {0xfe, 0xaf, 0xd9, 0x7e, 0xcc, 0xf, 0x91, 0x7f, 0x4b, 0x87, 0x65, + 0x24, 0xa1, 0xb8, 0x5c, 0x54, 0x4, 0x47, 0xc, 0x4b, 0xd2, 0x7e, + 0x39, 0xa8, 0x93, 0x9, 0xf5, 0x4, 0xc1, 0xf, 0x51, 0x50}, + {0x9d, 0x24, 0x6e, 0x33, 0xc5, 0xf, 0xc, 0x6f, 0xd9, 0xcf, 0x31, + 0xc3, 0x19, 0xde, 0x5e, 0x74, 0x1c, 0xfe, 0xee, 0x9, 0x0, 0xfd, + 0xd6, 0xf2, 0xbe, 0x1e, 0xfa, 0xf0, 0x8b, 0x15, 0x7c, 0x12}, + }, + { + {0x74, 0xb9, 0x51, 0xae, 0xc4, 0x8f, 0xa2, 0xde, 0x96, 0xfe, 0x4d, + 0x74, 0xd3, 0x73, 0x99, 0x1d, 0xa8, 0x48, 0x38, 0x87, 0xb, 0x68, + 0x40, 0x62, 0x95, 0xdf, 0x67, 0xd1, 0x79, 0x24, 0xd8, 0x4e}, + {0xa2, 0x79, 0x98, 0x2e, 0x42, 0x7c, 0x19, 0xf6, 0x47, 0x36, 0xca, + 0x52, 0xd4, 0xdd, 0x4a, 0xa4, 0xcb, 0xac, 0x4e, 0x4b, 0xc1, 0x3f, + 0x41, 0x9b, 0x68, 0x4f, 0xef, 0x7, 0x7d, 0xf8, 0x4e, 0x35}, + {0x75, 0xd9, 0xc5, 0x60, 0x22, 0xb5, 0xe3, 0xfe, 0xb8, 0xb0, 0x41, + 0xeb, 0xfc, 0x2e, 0x35, 0x50, 0x3c, 0x65, 0xf6, 0xa9, 0x30, 0xac, + 0x8, 0x88, 0x6d, 0x23, 0x39, 0x5, 0xd2, 0x92, 0x2d, 0x30}, + }, + }, + { + { + {0x77, 0xf1, 0xe0, 0xe4, 0xb6, 0x6f, 0xbc, 0x2d, 0x93, 0x6a, 0xbd, + 0xa4, 0x29, 0xbf, 0xe1, 0x4, 0xe8, 0xf6, 0x7a, 0x78, 0xd4, 0x66, + 0x19, 0x5e, 0x60, 0xd0, 0x26, 0xb4, 0x5e, 0x5f, 0xdc, 0xe}, + {0x3d, 0x28, 0xa4, 0xbc, 0xa2, 0xc1, 0x13, 0x78, 0xd9, 0x3d, 0x86, + 0xa1, 0x91, 0xf0, 0x62, 0xed, 0x86, 0xfa, 0x68, 0xc2, 0xb8, 0xbc, + 0xc7, 0xae, 0x4c, 0xae, 0x1c, 0x6f, 0xb7, 0xd3, 0xe5, 0x10}, + {0x67, 0x8e, 0xda, 0x53, 0xd6, 0xbf, 0x53, 0x54, 0x41, 0xf6, 0xa9, + 0x24, 0xec, 0x1e, 0xdc, 0xe9, 0x23, 0x8a, 0x57, 0x3, 0x3b, 0x26, + 0x87, 0xbf, 0x72, 0xba, 0x1c, 0x36, 0x51, 0x6c, 0xb4, 0x45}, + }, + { + {0xe4, 0xe3, 0x7f, 0x8a, 0xdd, 0x4d, 0x9d, 0xce, 0x30, 0xe, 0x62, + 0x76, 0x56, 0x64, 0x13, 0xab, 0x58, 0x99, 0xe, 0xb3, 0x7b, 0x4f, + 0x59, 0x4b, 0xdf, 0x29, 0x12, 0x32, 0xef, 0xa, 0x1c, 0x5c}, + {0xa1, 0x7f, 0x4f, 0x31, 0xbf, 0x2a, 0x40, 0xa9, 0x50, 0xf4, 0x8c, + 0x8e, 0xdc, 0xf1, 0x57, 0xe2, 0x84, 0xbe, 0xa8, 0x23, 0x4b, 0xd5, + 0xbb, 0x1d, 0x3b, 0x71, 0xcb, 0x6d, 0xa3, 0xbf, 0x77, 0x21}, + {0x8f, 0xdb, 0x79, 0xfa, 0xbc, 0x1b, 0x8, 0x37, 0xb3, 0x59, 0x5f, + 0xc2, 0x1e, 0x81, 0x48, 0x60, 0x87, 0x24, 0x83, 
0x9c, 0x65, 0x76, + 0x7a, 0x8, 0xbb, 0xb5, 0x8a, 0x7d, 0x38, 0x19, 0xe6, 0x4a}, + }, + { + {0x83, 0xfb, 0x5b, 0x98, 0x44, 0x7e, 0x11, 0x61, 0x36, 0x31, 0x96, + 0x71, 0x2a, 0x46, 0xe0, 0xfc, 0x4b, 0x90, 0x25, 0xd4, 0x48, 0x34, + 0xac, 0x83, 0x64, 0x3d, 0xa4, 0x5b, 0xbe, 0x5a, 0x68, 0x75}, + {0x2e, 0xa3, 0x44, 0x53, 0xaa, 0xf6, 0xdb, 0x8d, 0x78, 0x40, 0x1b, + 0xb4, 0xb4, 0xea, 0x88, 0x7d, 0x60, 0xd, 0x13, 0x4a, 0x97, 0xeb, + 0xb0, 0x5e, 0x3, 0x3e, 0xbf, 0x17, 0x1b, 0xd9, 0x0, 0x1a}, + {0xb2, 0xf2, 0x61, 0xeb, 0x33, 0x9, 0x96, 0x6e, 0x52, 0x49, 0xff, + 0xc9, 0xa8, 0xf, 0x3d, 0x54, 0x69, 0x65, 0xf6, 0x7a, 0x10, 0x75, + 0x72, 0xdf, 0xaa, 0xe6, 0xb0, 0x23, 0xb6, 0x29, 0x55, 0x13}, + }, + { + {0xfe, 0x83, 0x2e, 0xe2, 0xbc, 0x16, 0xc7, 0xf5, 0xc1, 0x85, 0x9, + 0xe8, 0x19, 0xeb, 0x2b, 0xb4, 0xae, 0x4a, 0x25, 0x14, 0x37, 0xa6, + 0x9d, 0xec, 0x13, 0xa6, 0x90, 0x15, 0x5, 0xea, 0x72, 0x59}, + {0x18, 0xd5, 0xd1, 0xad, 0xd7, 0xdb, 0xf0, 0x18, 0x11, 0x1f, 0xc1, + 0xcf, 0x88, 0x78, 0x9f, 0x97, 0x9b, 0x75, 0x14, 0x71, 0xf0, 0xe1, + 0x32, 0x87, 0x1, 0x3a, 0xca, 0x65, 0x1a, 0xb8, 0xb5, 0x79}, + {0x11, 0x78, 0x8f, 0xdc, 0x20, 0xac, 0xd4, 0xf, 0xa8, 0x4f, 0x4d, + 0xac, 0x94, 0xd2, 0x9a, 0x9a, 0x34, 0x4, 0x36, 0xb3, 0x64, 0x2d, + 0x1b, 0xc0, 0xdb, 0x3b, 0x5f, 0x90, 0x95, 0x9c, 0x7e, 0x4f}, + }, + { + {0xfe, 0x99, 0x52, 0x35, 0x3d, 0x44, 0xc8, 0x71, 0xd7, 0xea, 0xeb, + 0xdb, 0x1c, 0x3b, 0xcd, 0x8b, 0x66, 0x94, 0xa4, 0xf1, 0x9e, 0x49, + 0x92, 0x80, 0xc8, 0xad, 0x44, 0xa1, 0xc4, 0xee, 0x42, 0x19}, + {0x2e, 0x30, 0x81, 0x57, 0xbc, 0x4b, 0x67, 0x62, 0xf, 0xdc, 0xad, + 0x89, 0x39, 0xf, 0x52, 0xd8, 0xc6, 0xd9, 0xfb, 0x53, 0xae, 0x99, + 0x29, 0x8c, 0x4c, 0x8e, 0x63, 0x2e, 0xd9, 0x3a, 0x99, 0x31}, + {0x92, 0x49, 0x23, 0xae, 0x19, 0x53, 0xac, 0x7d, 0x92, 0x3e, 0xea, + 0xc, 0x91, 0x3d, 0x1b, 0x2c, 0x22, 0x11, 0x3c, 0x25, 0x94, 0xe4, + 0x3c, 0x55, 0x75, 0xca, 0xf9, 0x4e, 0x31, 0x65, 0xa, 0x2a}, + }, + { + {0x3a, 0x79, 0x1c, 0x3c, 0xcd, 0x1a, 0x36, 0xcf, 0x3b, 0xbc, 0x35, + 0x5a, 0xac, 0xbc, 0x9e, 0x2f, 0xab, 0xa6, 0xcd, 0xa8, 0xe9, 0x60, + 0xe8, 0x60, 0x13, 0x1a, 0xea, 0x6d, 0x9b, 0xc3, 0x5d, 0x5}, + {0xc2, 0x27, 0xf9, 0xf7, 0x7f, 0x93, 0xb7, 0x2d, 0x35, 0xa6, 0xd0, + 0x17, 0x6, 0x1f, 0x74, 0xdb, 0x76, 0xaf, 0x55, 0x11, 0xa2, 0xf3, + 0x82, 0x59, 0xed, 0x2d, 0x7c, 0x64, 0x18, 0xe2, 0xf6, 0x4c}, + {0xb6, 0x5b, 0x8d, 0xc2, 0x7c, 0x22, 0x19, 0xb1, 0xab, 0xff, 0x4d, + 0x77, 0xbc, 0x4e, 0xe2, 0x7, 0x89, 0x2c, 0xa3, 0xe4, 0xce, 0x78, + 0x3c, 0xa8, 0xb6, 0x24, 0xaa, 0x10, 0x77, 0x30, 0x1a, 0x12}, + }, + { + {0xc9, 0x83, 0x74, 0xc7, 0x3e, 0x71, 0x59, 0xd6, 0xaf, 0x96, 0x2b, + 0xb8, 0x77, 0xe0, 0xbf, 0x88, 0xd3, 0xbc, 0x97, 0x10, 0x23, 0x28, + 0x9e, 0x28, 0x9b, 0x3a, 0xed, 0x6c, 0x4a, 0xb9, 0x7b, 0x52}, + {0x97, 0x4a, 0x3, 0x9f, 0x5e, 0x5d, 0xdb, 0xe4, 0x2d, 0xbc, 0x34, + 0x30, 0x9, 0xfc, 0x53, 0xe1, 0xb1, 0xd3, 0x51, 0x95, 0x91, 0x46, + 0x5, 0x46, 0x2d, 0xe5, 0x40, 0x7a, 0x6c, 0xc7, 0x3f, 0x33}, + {0x2e, 0x48, 0x5b, 0x99, 0x2a, 0x99, 0x3d, 0x56, 0x1, 0x38, 0x38, + 0x6e, 0x7c, 0xd0, 0x5, 0x34, 0xe5, 0xd8, 0x64, 0x2f, 0xde, 0x35, + 0x50, 0x48, 0xf7, 0xa9, 0xa7, 0x20, 0x9b, 0x6, 0x89, 0x6b}, + }, + { + {0x77, 0xdb, 0xc7, 0xb5, 0x8c, 0xfa, 0x82, 0x40, 0x55, 0xc1, 0x34, + 0xc7, 0xf8, 0x86, 0x86, 0x6, 0x7e, 0xa5, 0xe7, 0xf6, 0xd9, 0xc8, + 0xe6, 0x29, 0xcf, 0x9b, 0x63, 0xa7, 0x8, 0xd3, 0x73, 0x4}, + {0xd, 0x22, 0x70, 0x62, 0x41, 0xa0, 0x2a, 0x81, 0x4e, 0x5b, 0x24, + 0xf9, 0xfa, 0x89, 0x5a, 0x99, 0x5, 0xef, 0x72, 0x50, 0xce, 0xc4, + 0xad, 0xff, 0x73, 0xeb, 0x73, 0xaa, 0x3, 0x21, 0xbc, 0x23}, + {0x5, 0x9e, 0x58, 0x3, 0x26, 0x79, 0xee, 0xca, 
0x92, 0xc4, 0xdc, + 0x46, 0x12, 0x42, 0x4b, 0x2b, 0x4f, 0xa9, 0x1, 0xe6, 0x74, 0xef, + 0xa1, 0x2, 0x1a, 0x34, 0x4, 0xde, 0xbf, 0x73, 0x2f, 0x10}, + }, + }, + { + { + {0x9a, 0x1c, 0x51, 0xb5, 0xe0, 0xda, 0xb4, 0xa2, 0x6, 0xff, 0xff, + 0x2b, 0x29, 0x60, 0xc8, 0x7a, 0x34, 0x42, 0x50, 0xf5, 0x5d, 0x37, + 0x1f, 0x98, 0x2d, 0xa1, 0x4e, 0xda, 0x25, 0xd7, 0x6b, 0x3f}, + {0xc6, 0x45, 0x57, 0x7f, 0xab, 0xb9, 0x18, 0xeb, 0x90, 0xc6, 0x87, + 0x57, 0xee, 0x8a, 0x3a, 0x2, 0xa9, 0xaf, 0xf7, 0x2d, 0xda, 0x12, + 0x27, 0xb7, 0x3d, 0x1, 0x5c, 0xea, 0x25, 0x7d, 0x59, 0x36}, + {0xac, 0x58, 0x60, 0x10, 0x7b, 0x8d, 0x4d, 0x73, 0x5f, 0x90, 0xc6, + 0x6f, 0x9e, 0x57, 0x40, 0xd9, 0x2d, 0x93, 0x2, 0x92, 0xf9, 0xf8, + 0x66, 0x64, 0xd0, 0xd6, 0x60, 0xda, 0x19, 0xcc, 0x7e, 0x7b}, + }, + { + {0x9b, 0xfa, 0x7c, 0xa7, 0x51, 0x4a, 0xae, 0x6d, 0x50, 0x86, 0xa3, + 0xe7, 0x54, 0x36, 0x26, 0x82, 0xdb, 0x82, 0x2d, 0x8f, 0xcd, 0xff, + 0xbb, 0x9, 0xba, 0xca, 0xf5, 0x1b, 0x66, 0xdc, 0xbe, 0x3}, + {0xd, 0x69, 0x5c, 0x69, 0x3c, 0x37, 0xc2, 0x78, 0x6e, 0x90, 0x42, + 0x6, 0x66, 0x2e, 0x25, 0xdd, 0xd2, 0x2b, 0xe1, 0x4a, 0x44, 0x44, + 0x1d, 0x95, 0x56, 0x39, 0x74, 0x1, 0x76, 0xad, 0x35, 0x42}, + {0xf5, 0x75, 0x89, 0x7, 0xd, 0xcb, 0x58, 0x62, 0x98, 0xf2, 0x89, + 0x91, 0x54, 0x42, 0x29, 0x49, 0xe4, 0x6e, 0xe3, 0xe2, 0x23, 0xb4, + 0xca, 0xa0, 0xa1, 0x66, 0xf0, 0xcd, 0xb0, 0xe2, 0x7c, 0xe}, + }, + { + {0xf9, 0x70, 0x4b, 0xd9, 0xdf, 0xfe, 0xa6, 0xfe, 0x2d, 0xba, 0xfc, + 0xc1, 0x51, 0xc0, 0x30, 0xf1, 0x89, 0xab, 0x2f, 0x7f, 0x7e, 0xd4, + 0x82, 0x48, 0xb5, 0xee, 0xec, 0x8a, 0x13, 0x56, 0x52, 0x61}, + {0xa3, 0x85, 0x8c, 0xc4, 0x3a, 0x64, 0x94, 0xc4, 0xad, 0x39, 0x61, + 0x3c, 0xf4, 0x1d, 0x36, 0xfd, 0x48, 0x4d, 0xe9, 0x3a, 0xdd, 0x17, + 0xdb, 0x9, 0x4a, 0x67, 0xb4, 0x8f, 0x5d, 0xa, 0x6e, 0x66}, + {0xd, 0xcb, 0x70, 0x48, 0x4e, 0xf6, 0xbb, 0x2a, 0x6b, 0x8b, 0x45, + 0xaa, 0xf0, 0xbc, 0x65, 0xcd, 0x5d, 0x98, 0xe8, 0x75, 0xba, 0x4e, + 0xbe, 0x9a, 0xe4, 0xde, 0x14, 0xd5, 0x10, 0xc8, 0xb, 0x7f}, + }, + { + {0xa0, 0x13, 0x72, 0x73, 0xad, 0x9d, 0xac, 0x83, 0x98, 0x2e, 0xf7, + 0x2e, 0xba, 0xf8, 0xf6, 0x9f, 0x57, 0x69, 0xec, 0x43, 0xdd, 0x2e, + 0x1e, 0x31, 0x75, 0xab, 0xc5, 0xde, 0x7d, 0x90, 0x3a, 0x1d}, + {0x6f, 0x13, 0xf4, 0x26, 0xa4, 0x6b, 0x0, 0xb9, 0x35, 0x30, 0xe0, + 0x57, 0x9e, 0x36, 0x67, 0x8d, 0x28, 0x3c, 0x46, 0x4f, 0xd9, 0xdf, + 0xc8, 0xcb, 0xf5, 0xdb, 0xee, 0xf8, 0xbc, 0x8d, 0x1f, 0xd}, + {0xdc, 0x81, 0xd0, 0x3e, 0x31, 0x93, 0x16, 0xba, 0x80, 0x34, 0x1b, + 0x85, 0xad, 0x9f, 0x32, 0x29, 0xcb, 0x21, 0x3, 0x3, 0x3c, 0x1, + 0x28, 0x1, 0xe3, 0xfd, 0x1b, 0xa3, 0x44, 0x1b, 0x1, 0x0}, + }, + { + {0x5c, 0xa7, 0xa, 0x6a, 0x69, 0x1f, 0x56, 0x16, 0x6a, 0xbd, 0x52, + 0x58, 0x5c, 0x72, 0xbf, 0xc1, 0xad, 0x66, 0x79, 0x9a, 0x7f, 0xdd, + 0xa8, 0x11, 0x26, 0x10, 0x85, 0xd2, 0xa2, 0x88, 0xd9, 0x63}, + {0xc, 0x6c, 0xc6, 0x3f, 0x6c, 0xa0, 0xdf, 0x3f, 0xd2, 0xd, 0xd6, + 0x4d, 0x8e, 0xe3, 0x40, 0x5d, 0x71, 0x4d, 0x8e, 0x26, 0x38, 0x8b, + 0xe3, 0x7a, 0xe1, 0x57, 0x83, 0x6e, 0x91, 0x8d, 0xc4, 0x3a}, + {0x2e, 0x23, 0xbd, 0xaf, 0x53, 0x7, 0x12, 0x0, 0x83, 0xf6, 0xd8, + 0xfd, 0xb8, 0xce, 0x2b, 0xe9, 0x91, 0x2b, 0xe7, 0x84, 0xb3, 0x69, + 0x16, 0xf8, 0x66, 0xa0, 0x68, 0x23, 0x2b, 0xd5, 0xfa, 0x33}, + }, + { + {0xe8, 0xcf, 0x22, 0xc4, 0xd0, 0xc8, 0x2c, 0x8d, 0xcb, 0x3a, 0xa1, + 0x5, 0x7b, 0x4f, 0x2b, 0x7, 0x6f, 0xa5, 0xf6, 0xec, 0xe6, 0xb6, + 0xfe, 0xa3, 0xe2, 0x71, 0xa, 0xb9, 0xcc, 0x55, 0xc3, 0x3c}, + {0x16, 0x1e, 0xe4, 0xc5, 0xc6, 0x49, 0x6, 0x54, 0x35, 0x77, 0x3f, + 0x33, 0x30, 0x64, 0xf8, 0xa, 0x46, 0xe7, 0x5, 0xf3, 0xd2, 0xfc, + 0xac, 0xb2, 0xa7, 0xdc, 0x56, 0xa2, 0x29, 
0xf4, 0xc0, 0x16}, + {0x31, 0x91, 0x3e, 0x90, 0x43, 0x94, 0xb6, 0xe9, 0xce, 0x37, 0x56, + 0x7a, 0xcb, 0x94, 0xa4, 0xb8, 0x44, 0x92, 0xba, 0xba, 0xa4, 0xd1, + 0x7c, 0xc8, 0x68, 0x75, 0xae, 0x6b, 0x42, 0xaf, 0x1e, 0x63}, + }, + { + {0xe8, 0xd, 0x70, 0xa3, 0xb9, 0x75, 0xd9, 0x47, 0x52, 0x5, 0xf8, + 0xe2, 0xfb, 0xc5, 0x80, 0x72, 0xe1, 0x5d, 0xe4, 0x32, 0x27, 0x8f, + 0x65, 0x53, 0xb5, 0x80, 0x5f, 0x66, 0x7f, 0x2c, 0x1f, 0x43}, + {0x9f, 0xfe, 0x66, 0xda, 0x10, 0x4, 0xe9, 0xb3, 0xa6, 0xe5, 0x16, + 0x6c, 0x52, 0x4b, 0xdd, 0x85, 0x83, 0xbf, 0xf9, 0x1e, 0x61, 0x97, + 0x3d, 0xbc, 0xb5, 0x19, 0xa9, 0x1e, 0x8b, 0x64, 0x99, 0x55}, + {0x19, 0x7b, 0x8f, 0x85, 0x44, 0x63, 0x2, 0xd6, 0x4a, 0x51, 0xea, + 0xa1, 0x2f, 0x35, 0xab, 0x14, 0xd7, 0xa9, 0x90, 0x20, 0x1a, 0x44, + 0x0, 0x89, 0x26, 0x3b, 0x25, 0x91, 0x5f, 0x71, 0x4, 0x7b}, + }, + { + {0xc6, 0xba, 0xe6, 0xc4, 0x80, 0xc2, 0x76, 0xb3, 0xb, 0x9b, 0x1d, + 0x6d, 0xdd, 0xd3, 0xe, 0x97, 0x44, 0xf9, 0xb, 0x45, 0x58, 0x95, + 0x9a, 0xb0, 0x23, 0xe2, 0xcd, 0x57, 0xfa, 0xac, 0xd0, 0x48}, + {0x43, 0xae, 0xf6, 0xac, 0x28, 0xbd, 0xed, 0x83, 0xb4, 0x7a, 0x5c, + 0x7d, 0x8b, 0x7c, 0x35, 0x86, 0x44, 0x2c, 0xeb, 0xb7, 0x69, 0x47, + 0x40, 0xc0, 0x3f, 0x58, 0xf6, 0xc2, 0xf5, 0x7b, 0xb3, 0x59}, + {0x71, 0xe6, 0xab, 0x7d, 0xe4, 0x26, 0xf, 0xb6, 0x37, 0x3a, 0x2f, + 0x62, 0x97, 0xa1, 0xd1, 0xf1, 0x94, 0x3, 0x96, 0xe9, 0x7e, 0xce, + 0x8, 0x42, 0xdb, 0x3b, 0x6d, 0x33, 0x91, 0x41, 0x23, 0x16}, + }, + }, + { + { + {0x40, 0x86, 0xf3, 0x1f, 0xd6, 0x9c, 0x49, 0xdd, 0xa0, 0x25, 0x36, + 0x6, 0xc3, 0x9b, 0xcd, 0x29, 0xc3, 0x3d, 0xd7, 0x3d, 0x2, 0xd8, + 0xe2, 0x51, 0x31, 0x92, 0x3b, 0x20, 0x7a, 0x70, 0x25, 0x4a}, + {0xf6, 0x7f, 0x26, 0xf6, 0xde, 0x99, 0xe4, 0xb9, 0x43, 0x8, 0x2c, + 0x74, 0x7b, 0xca, 0x72, 0x77, 0xb1, 0xf2, 0xa4, 0xe9, 0x3f, 0x15, + 0xa0, 0x23, 0x6, 0x50, 0xd0, 0xd5, 0xec, 0xdf, 0xdf, 0x2c}, + {0x6a, 0xed, 0xf6, 0x53, 0x8a, 0x66, 0xb7, 0x2a, 0xa1, 0x70, 0xd1, + 0x1d, 0x58, 0x42, 0x42, 0x30, 0x61, 0x1, 0xe2, 0x3a, 0x4c, 0x14, + 0x0, 0x40, 0xfc, 0x49, 0x8e, 0x24, 0x6d, 0x89, 0x21, 0x57}, + }, + { + {0x4e, 0xda, 0xd0, 0xa1, 0x91, 0x50, 0x5d, 0x28, 0x8, 0x3e, 0xfe, + 0xb5, 0xa7, 0x6f, 0xaa, 0x4b, 0xb3, 0x93, 0x93, 0xe1, 0x7c, 0x17, + 0xe5, 0x63, 0xfd, 0x30, 0xb0, 0xc4, 0xaf, 0x35, 0xc9, 0x3}, + {0xae, 0x1b, 0x18, 0xfd, 0x17, 0x55, 0x6e, 0xb, 0xb4, 0x63, 0xb9, + 0x2b, 0x9f, 0x62, 0x22, 0x90, 0x25, 0x46, 0x6, 0x32, 0xe9, 0xbc, + 0x9, 0x55, 0xda, 0x13, 0x3c, 0xf6, 0x74, 0xdd, 0x8e, 0x57}, + {0x3d, 0xc, 0x2b, 0x49, 0xc6, 0x76, 0x72, 0x99, 0xfc, 0x5, 0xe2, + 0xdf, 0xc4, 0xc2, 0xcc, 0x47, 0x3c, 0x3a, 0x62, 0xdd, 0x84, 0x9b, + 0xd2, 0xdc, 0xa2, 0xc7, 0x88, 0x2, 0x59, 0xab, 0xc2, 0x3e}, + }, + { + {0xcb, 0xd1, 0x32, 0xae, 0x9, 0x3a, 0x21, 0xa7, 0xd5, 0xc2, 0xf5, + 0x40, 0xdf, 0x87, 0x2b, 0xf, 0x29, 0xab, 0x1e, 0xe8, 0xc6, 0xa4, + 0xae, 0xb, 0x5e, 0xac, 0xdb, 0x6a, 0x6c, 0xf6, 0x1b, 0xe}, + {0xb9, 0x7b, 0xd8, 0xe4, 0x7b, 0xd2, 0xa0, 0xa1, 0xed, 0x1a, 0x39, + 0x61, 0xeb, 0x4d, 0x8b, 0xa9, 0x83, 0x9b, 0xcb, 0x73, 0xd0, 0xdd, + 0xa0, 0x99, 0xce, 0xca, 0xf, 0x20, 0x5a, 0xc2, 0xd5, 0x2d}, + {0x7e, 0x88, 0x2c, 0x79, 0xe9, 0xd5, 0xab, 0xe2, 0x5d, 0x6d, 0x92, + 0xcb, 0x18, 0x0, 0x2, 0x1a, 0x1e, 0x5f, 0xae, 0xba, 0xcd, 0x69, + 0xba, 0xbf, 0x5f, 0x8f, 0xe8, 0x5a, 0xb3, 0x48, 0x5, 0x73}, + }, + { + {0x34, 0xe3, 0xd6, 0xa1, 0x4b, 0x9, 0x5b, 0x80, 0x19, 0x3f, 0x35, + 0x9, 0x77, 0xf1, 0x3e, 0xbf, 0x2b, 0x70, 0x22, 0x6, 0xcb, 0x6, + 0x3f, 0x42, 0xdd, 0x45, 0x78, 0xd8, 0x77, 0x22, 0x5a, 0x58}, + {0xee, 0xb8, 0xa8, 0xcb, 0xa3, 0x51, 0x35, 0xc4, 0x16, 0x5f, 0x11, + 0xb2, 0x1d, 0x6f, 0xa2, 0x65, 0x50, 
0x38, 0x8c, 0xab, 0x52, 0x4f, + 0xf, 0x76, 0xca, 0xb8, 0x1d, 0x41, 0x3b, 0x44, 0x43, 0x30}, + {0x62, 0x89, 0xd4, 0x33, 0x82, 0x5f, 0x8a, 0xa1, 0x7f, 0x25, 0x78, + 0xec, 0xb5, 0xc4, 0x98, 0x66, 0xff, 0x41, 0x3e, 0x37, 0xa5, 0x6f, + 0x8e, 0xa7, 0x1f, 0x98, 0xef, 0x50, 0x89, 0x27, 0x56, 0x76}, + }, + { + {0x9d, 0xcf, 0x86, 0xea, 0xa3, 0x73, 0x70, 0xe1, 0xdc, 0x5f, 0x15, + 0x7, 0xb7, 0xfb, 0x8c, 0x3a, 0x8e, 0x8a, 0x83, 0x31, 0xfc, 0xe7, + 0x53, 0x48, 0x16, 0xf6, 0x13, 0xb6, 0x84, 0xf4, 0xbb, 0x28}, + {0xc0, 0xc8, 0x1f, 0xd5, 0x59, 0xcf, 0xc3, 0x38, 0xf2, 0xb6, 0x6, + 0x5, 0xfd, 0xd2, 0xed, 0x9b, 0x8f, 0xe, 0x57, 0xab, 0x9f, 0x10, + 0xbf, 0x26, 0xa6, 0x46, 0xb8, 0xc1, 0xa8, 0x60, 0x41, 0x3f}, + {0x7c, 0x6c, 0x13, 0x6f, 0x5c, 0x2f, 0x61, 0xf2, 0xbe, 0x11, 0xdd, + 0xf6, 0x7, 0xd1, 0xea, 0xaf, 0x33, 0x6f, 0xde, 0x13, 0xd2, 0x9a, + 0x7e, 0x52, 0x5d, 0xf7, 0x88, 0x81, 0x35, 0xcb, 0x79, 0x1e}, + }, + { + {0x81, 0x81, 0xe0, 0xf5, 0xd8, 0x53, 0xe9, 0x77, 0xd9, 0xde, 0x9d, + 0x29, 0x44, 0xc, 0xa5, 0x84, 0xe5, 0x25, 0x45, 0x86, 0xc, 0x2d, + 0x6c, 0xdc, 0xf4, 0xf2, 0xd1, 0x39, 0x2d, 0xb5, 0x8a, 0x47}, + {0xf1, 0xe3, 0xf7, 0xee, 0xc3, 0x36, 0x34, 0x1, 0xf8, 0x10, 0x9e, + 0xfe, 0x7f, 0x6a, 0x8b, 0x82, 0xfc, 0xde, 0xf9, 0xbc, 0xe5, 0x8, + 0xf9, 0x7f, 0x31, 0x38, 0x3b, 0x3a, 0x1b, 0x95, 0xd7, 0x65}, + {0x59, 0xd1, 0x52, 0x92, 0xd3, 0xa4, 0xa6, 0x66, 0x7, 0xc8, 0x1a, + 0x87, 0xbc, 0xe1, 0xdd, 0xe5, 0x6f, 0xc9, 0xc1, 0xa6, 0x40, 0x6b, + 0x2c, 0xb8, 0x14, 0x22, 0x21, 0x1a, 0x41, 0x7a, 0xd8, 0x16}, + }, + { + {0x83, 0x5, 0x4e, 0xd5, 0xe2, 0xd5, 0xa4, 0xfb, 0xfa, 0x99, 0xbd, + 0x2e, 0xd7, 0xaf, 0x1f, 0xe2, 0x8f, 0x77, 0xe9, 0x6e, 0x73, 0xc2, + 0x7a, 0x49, 0xde, 0x6d, 0x5a, 0x7a, 0x57, 0xb, 0x99, 0x1f}, + {0x15, 0x62, 0x6, 0x42, 0x5a, 0x7e, 0xbd, 0xb3, 0xc1, 0x24, 0x5a, + 0xc, 0xcd, 0xe3, 0x9b, 0x87, 0xb7, 0x94, 0xf9, 0xd6, 0xb1, 0x5d, + 0xc0, 0x57, 0xa6, 0x8c, 0xf3, 0x65, 0x81, 0x7c, 0xf8, 0x28}, + {0xd6, 0xf7, 0xe8, 0x1b, 0xad, 0x4e, 0x34, 0xa3, 0x8f, 0x79, 0xea, + 0xac, 0xeb, 0x50, 0x1e, 0x7d, 0x52, 0xe0, 0xd, 0x52, 0x9e, 0x56, + 0xc6, 0x77, 0x3e, 0x6d, 0x4d, 0x53, 0xe1, 0x2f, 0x88, 0x45}, + }, + { + {0xe4, 0x6f, 0x3c, 0x94, 0x29, 0x99, 0xac, 0xd8, 0xa2, 0x92, 0x83, + 0xa3, 0x61, 0xf1, 0xf9, 0xb5, 0xf3, 0x9a, 0xc8, 0xbe, 0x13, 0xdb, + 0x99, 0x26, 0x74, 0xf0, 0x5, 0xe4, 0x3c, 0x84, 0xcf, 0x7d}, + {0xd6, 0x83, 0x79, 0x75, 0x5d, 0x34, 0x69, 0x66, 0xa6, 0x11, 0xaa, + 0x17, 0x11, 0xed, 0xb6, 0x62, 0x8f, 0x12, 0x5e, 0x98, 0x57, 0x18, + 0xdd, 0x7d, 0xdd, 0xf6, 0x26, 0xf6, 0xb8, 0xe5, 0x8f, 0x68}, + {0xc0, 0x32, 0x47, 0x4a, 0x48, 0xd6, 0x90, 0x6c, 0x99, 0x32, 0x56, + 0xca, 0xfd, 0x43, 0x21, 0xd5, 0xe1, 0xc6, 0x5d, 0x91, 0xc3, 0x28, + 0xbe, 0xb3, 0x1b, 0x19, 0x27, 0x73, 0x7e, 0x68, 0x39, 0x67}, + }, + }, + { + { + {0xc0, 0x1a, 0xc, 0xc8, 0x9d, 0xcc, 0x6d, 0xa6, 0x36, 0xa4, 0x38, + 0x1b, 0xf4, 0x5c, 0xa0, 0x97, 0xc6, 0xd7, 0xdb, 0x95, 0xbe, 0xf3, + 0xeb, 0xa7, 0xab, 0x7d, 0x7e, 0x8d, 0xf6, 0xb8, 0xa0, 0x7d}, + {0xa6, 0x75, 0x56, 0x38, 0x14, 0x20, 0x78, 0xef, 0xe8, 0xa9, 0xfd, + 0xaa, 0x30, 0x9f, 0x64, 0xa2, 0xcb, 0xa8, 0xdf, 0x5c, 0x50, 0xeb, + 0xd1, 0x4c, 0xb3, 0xc0, 0x4d, 0x1d, 0xba, 0x5a, 0x11, 0x46}, + {0x76, 0xda, 0xb5, 0xc3, 0x53, 0x19, 0xf, 0xd4, 0x9b, 0x9e, 0x11, + 0x21, 0x73, 0x6f, 0xac, 0x1d, 0x60, 0x59, 0xb2, 0xfe, 0x21, 0x60, + 0xcc, 0x3, 0x4b, 0x4b, 0x67, 0x83, 0x7e, 0x88, 0x5f, 0x5a}, + }, + { + {0xb9, 0x43, 0xa6, 0xa0, 0xd3, 0x28, 0x96, 0x9e, 0x64, 0x20, 0xc3, + 0xe6, 0x0, 0xcb, 0xc3, 0xb5, 0x32, 0xec, 0x2d, 0x7c, 0x89, 0x2, + 0x53, 0x9b, 0xc, 0xc7, 0xd1, 0xd5, 0xe2, 0x7a, 0xe3, 0x43}, + {0x11, 0x3d, 0xa1, 
0x70, 0xcf, 0x1, 0x63, 0x8f, 0xc4, 0xd0, 0xd, + 0x35, 0x15, 0xb8, 0xce, 0xcf, 0x7e, 0xa4, 0xbc, 0xa4, 0xd4, 0x97, + 0x2, 0xf7, 0x34, 0x14, 0x4d, 0xe4, 0x56, 0xb6, 0x69, 0x36}, + {0x33, 0xe1, 0xa6, 0xed, 0x6, 0x3f, 0x7e, 0x38, 0xc0, 0x3a, 0xa1, + 0x99, 0x51, 0x1d, 0x30, 0x67, 0x11, 0x38, 0x26, 0x36, 0xf8, 0xd8, + 0x5a, 0xbd, 0xbe, 0xe9, 0xd5, 0x4f, 0xcd, 0xe6, 0x21, 0x6a}, + }, + { + {0xe3, 0xb2, 0x99, 0x66, 0x12, 0x29, 0x41, 0xef, 0x1, 0x13, 0x8d, + 0x70, 0x47, 0x8, 0xd3, 0x71, 0xbd, 0xb0, 0x82, 0x11, 0xd0, 0x32, + 0x54, 0x32, 0x36, 0x8b, 0x1e, 0x0, 0x7, 0x1b, 0x37, 0x45}, + {0x5f, 0xe6, 0x46, 0x30, 0xa, 0x17, 0xc6, 0xf1, 0x24, 0x35, 0xd2, + 0x0, 0x2a, 0x2a, 0x71, 0x58, 0x55, 0xb7, 0x82, 0x8c, 0x3c, 0xbd, + 0xdb, 0x69, 0x57, 0xff, 0x95, 0xa1, 0xf1, 0xf9, 0x6b, 0x58}, + {0xb, 0x79, 0xf8, 0x5e, 0x8d, 0x8, 0xdb, 0xa6, 0xe5, 0x37, 0x9, + 0x61, 0xdc, 0xf0, 0x78, 0x52, 0xb8, 0x6e, 0xa1, 0x61, 0xd2, 0x49, + 0x3, 0xac, 0x79, 0x21, 0xe5, 0x90, 0x37, 0xb0, 0xaf, 0xe}, + }, + { + {0x1d, 0xae, 0x75, 0xf, 0x5e, 0x80, 0x40, 0x51, 0x30, 0xcc, 0x62, + 0x26, 0xe3, 0xfb, 0x2, 0xec, 0x6d, 0x39, 0x92, 0xea, 0x1e, 0xdf, + 0xeb, 0x2c, 0xb3, 0x5b, 0x43, 0xc5, 0x44, 0x33, 0xae, 0x44}, + {0x2f, 0x4, 0x48, 0x37, 0xc1, 0x55, 0x5, 0x96, 0x11, 0xaa, 0xb, + 0x82, 0xe6, 0x41, 0x9a, 0x21, 0xc, 0x6d, 0x48, 0x73, 0x38, 0xf7, + 0x81, 0x1c, 0x61, 0xc6, 0x2, 0x5a, 0x67, 0xcc, 0x9a, 0x30}, + {0xee, 0x43, 0xa5, 0xbb, 0xb9, 0x89, 0xf2, 0x9c, 0x42, 0x71, 0xc9, + 0x5a, 0x9d, 0xe, 0x76, 0xf3, 0xaa, 0x60, 0x93, 0x4f, 0xc6, 0xe5, + 0x82, 0x1d, 0x8f, 0x67, 0x94, 0x7f, 0x1b, 0x22, 0xd5, 0x62}, + }, + { + {0x3c, 0x7a, 0xf7, 0x3a, 0x26, 0xd4, 0x85, 0x75, 0x4d, 0x14, 0xe9, + 0xfe, 0x11, 0x7b, 0xae, 0xdf, 0x3d, 0x19, 0xf7, 0x59, 0x80, 0x70, + 0x6, 0xa5, 0x37, 0x20, 0x92, 0x83, 0x53, 0x9a, 0xf2, 0x14}, + {0x6d, 0x93, 0xd0, 0x18, 0x9c, 0x29, 0x4c, 0x52, 0xc, 0x1a, 0xc, + 0x8a, 0x6c, 0xb5, 0x6b, 0xc8, 0x31, 0x86, 0x4a, 0xdb, 0x2e, 0x5, + 0x75, 0xa3, 0x62, 0x45, 0x75, 0xbc, 0xe4, 0xfd, 0xe, 0x5c}, + {0xf5, 0xd7, 0xb2, 0x25, 0xdc, 0x7e, 0x71, 0xdf, 0x40, 0x30, 0xb5, + 0x99, 0xdb, 0x70, 0xf9, 0x21, 0x62, 0x4c, 0xed, 0xc3, 0xb7, 0x34, + 0x92, 0xda, 0x3e, 0x9, 0xee, 0x7b, 0x5c, 0x36, 0x72, 0x5e}, + }, + { + {0x3e, 0xb3, 0x8, 0x2f, 0x6, 0x39, 0x93, 0x7d, 0xbe, 0x32, 0x9f, + 0xdf, 0xe5, 0x59, 0x96, 0x5b, 0xfd, 0xbd, 0x9e, 0x1f, 0xad, 0x3d, + 0xff, 0xac, 0xb7, 0x49, 0x73, 0xcb, 0x55, 0x5, 0xb2, 0x70}, + {0x7f, 0x21, 0x71, 0x45, 0x7, 0xfc, 0x5b, 0x57, 0x5b, 0xd9, 0x94, + 0x6, 0x5d, 0x67, 0x79, 0x37, 0x33, 0x1e, 0x19, 0xf4, 0xbb, 0x37, + 0xa, 0x9a, 0xbc, 0xea, 0xb4, 0x47, 0x4c, 0x10, 0xf1, 0x77}, + {0x4c, 0x2c, 0x11, 0x55, 0xc5, 0x13, 0x51, 0xbe, 0xcd, 0x1f, 0x88, + 0x9a, 0x3a, 0x42, 0x88, 0x66, 0x47, 0x3b, 0x50, 0x5e, 0x85, 0x77, + 0x66, 0x44, 0x4a, 0x40, 0x6, 0x4a, 0x8f, 0x39, 0x34, 0xe}, + }, + { + {0x28, 0x19, 0x4b, 0x3e, 0x9, 0xb, 0x93, 0x18, 0x40, 0xf6, 0xf3, + 0x73, 0xe, 0xe1, 0xe3, 0x7d, 0x6f, 0x5d, 0x39, 0x73, 0xda, 0x17, + 0x32, 0xf4, 0x3e, 0x9c, 0x37, 0xca, 0xd6, 0xde, 0x8a, 0x6f}, + {0xe8, 0xbd, 0xce, 0x3e, 0xd9, 0x22, 0x7d, 0xb6, 0x7, 0x2f, 0x82, + 0x27, 0x41, 0xe8, 0xb3, 0x9, 0x8d, 0x6d, 0x5b, 0xb0, 0x1f, 0xa6, + 0x3f, 0x74, 0x72, 0x23, 0x36, 0x8a, 0x36, 0x5, 0x54, 0x5e}, + {0x9a, 0xb2, 0xb7, 0xfd, 0x3d, 0x12, 0x40, 0xe3, 0x91, 0xb2, 0x1a, + 0xa2, 0xe1, 0x97, 0x7b, 0x48, 0x9e, 0x94, 0xe6, 0xfd, 0x2, 0x7d, + 0x96, 0xf9, 0x97, 0xde, 0xd3, 0xc8, 0x2e, 0xe7, 0xd, 0x78}, + }, + { + {0x72, 0x27, 0xf4, 0x0, 0xf3, 0xea, 0x1f, 0x67, 0xaa, 0x41, 0x8c, + 0x2a, 0x2a, 0xeb, 0x72, 0x8f, 0x92, 0x32, 0x37, 0x97, 0xd7, 0x7f, + 0xa1, 0x29, 0xa6, 0x87, 
0xb5, 0x32, 0xad, 0xc6, 0xef, 0x1d}, + {0xbc, 0xe7, 0x9a, 0x8, 0x45, 0x85, 0xe2, 0xa, 0x6, 0x4d, 0x7f, + 0x1c, 0xcf, 0xde, 0x8d, 0x38, 0xb8, 0x11, 0x48, 0xa, 0x51, 0x15, + 0xac, 0x38, 0xe4, 0x8c, 0x92, 0x71, 0xf6, 0x8b, 0xb2, 0xe}, + {0xa7, 0x95, 0x51, 0xef, 0x1a, 0xbe, 0x5b, 0xaf, 0xed, 0x15, 0x7b, + 0x91, 0x77, 0x12, 0x8c, 0x14, 0x2e, 0xda, 0xe5, 0x7a, 0xfb, 0xf7, + 0x91, 0x29, 0x67, 0x28, 0xdd, 0xf8, 0x1b, 0x20, 0x7d, 0x46}, + }, + }, + { + { + {0xa9, 0xe7, 0x7a, 0x56, 0xbd, 0xf4, 0x1e, 0xbc, 0xbd, 0x98, 0x44, + 0xd6, 0xb2, 0x4c, 0x62, 0x3f, 0xc8, 0x4e, 0x1f, 0x2c, 0xd2, 0x64, + 0x10, 0xe4, 0x1, 0x40, 0x38, 0xba, 0xa5, 0xc5, 0xf9, 0x2e}, + {0xad, 0x4f, 0xef, 0x74, 0x9a, 0x91, 0xfe, 0x95, 0xa2, 0x8, 0xa3, + 0xf6, 0xec, 0x7b, 0x82, 0x3a, 0x1, 0x7b, 0xa4, 0x9, 0xd3, 0x1, + 0x4e, 0x96, 0x97, 0xc7, 0xa3, 0x5b, 0x4f, 0x3c, 0xc4, 0x71}, + {0xcd, 0x74, 0x9e, 0xfa, 0xf6, 0x6d, 0xfd, 0xb6, 0x7a, 0x26, 0xaf, + 0xe4, 0xbc, 0x78, 0x82, 0xf1, 0xe, 0x99, 0xef, 0xf1, 0xd0, 0xb3, + 0x55, 0x82, 0x93, 0xf2, 0xc5, 0x90, 0xa3, 0x8c, 0x75, 0x5a}, + }, + { + {0x94, 0xdc, 0x61, 0x1d, 0x8b, 0x91, 0xe0, 0x8c, 0x66, 0x30, 0x81, + 0x9a, 0x46, 0x36, 0xed, 0x8d, 0xd3, 0xaa, 0xe8, 0xaf, 0x29, 0xa8, + 0xe6, 0xd4, 0x3f, 0xd4, 0x39, 0xf6, 0x27, 0x80, 0x73, 0xa}, + {0x95, 0x24, 0x46, 0xd9, 0x10, 0x27, 0xb7, 0xa2, 0x3, 0x50, 0x7d, + 0xd5, 0xd2, 0xc6, 0xa8, 0x3a, 0xca, 0x87, 0xb4, 0xa0, 0xbf, 0x0, + 0xd4, 0xe3, 0xec, 0x72, 0xeb, 0xb3, 0x44, 0xe2, 0xba, 0x2d}, + {0xcc, 0xe1, 0xff, 0x57, 0x2f, 0x4a, 0xf, 0x98, 0x43, 0x98, 0x83, + 0xe1, 0xd, 0xd, 0x67, 0x0, 0xfd, 0x15, 0xfb, 0x49, 0x4a, 0x3f, + 0x5c, 0x10, 0x9c, 0xa6, 0x26, 0x51, 0x63, 0xca, 0x98, 0x26}, + }, + { + {0xe, 0xd9, 0x3d, 0x5e, 0x2f, 0x70, 0x3d, 0x2e, 0x86, 0x53, 0xd2, + 0xe4, 0x18, 0x9, 0x3f, 0x9e, 0x6a, 0xa9, 0x4d, 0x2, 0xf6, 0x3e, + 0x77, 0x5e, 0x32, 0x33, 0xfa, 0x4a, 0xc, 0x4b, 0x0, 0x3c}, + {0x78, 0xba, 0xb0, 0x32, 0x88, 0x31, 0x65, 0xe7, 0x8b, 0xff, 0x5c, + 0x92, 0xf7, 0x31, 0x18, 0x38, 0xcc, 0x1f, 0x29, 0xa0, 0x91, 0x1b, + 0xa8, 0x8, 0x7, 0xeb, 0xca, 0x49, 0xcc, 0x3d, 0xb4, 0x1f}, + {0x2b, 0xb8, 0xf4, 0x6, 0xac, 0x46, 0xa9, 0x9a, 0xf3, 0xc4, 0x6, + 0xa8, 0xa5, 0x84, 0xa2, 0x1c, 0x87, 0x47, 0xcd, 0xc6, 0x5f, 0x26, + 0xd3, 0x3e, 0x17, 0xd2, 0x1f, 0xcd, 0x1, 0xfd, 0x43, 0x6b}, + }, + { + {0xf3, 0xe, 0x76, 0x3e, 0x58, 0x42, 0xc7, 0xb5, 0x90, 0xb9, 0xa, + 0xee, 0xb9, 0x52, 0xdc, 0x75, 0x3f, 0x92, 0x2b, 0x7, 0xc2, 0x27, + 0x14, 0xbf, 0xf0, 0xd9, 0xf0, 0x6f, 0x2d, 0xb, 0x42, 0x73}, + {0x44, 0xc5, 0x97, 0x46, 0x4b, 0x5d, 0xa7, 0xc7, 0xbf, 0xff, 0xf, + 0xdf, 0x48, 0xf8, 0xfd, 0x15, 0x5a, 0x78, 0x46, 0xaa, 0xeb, 0xb9, + 0x68, 0x28, 0x14, 0xf7, 0x52, 0x5b, 0x10, 0xd7, 0x68, 0x5a}, + {0x6, 0x1e, 0x85, 0x9e, 0xcb, 0xf6, 0x2c, 0xaf, 0xc4, 0x38, 0x22, + 0xc6, 0x13, 0x39, 0x59, 0x8f, 0x73, 0xf3, 0xfb, 0x99, 0x96, 0xb8, + 0x8a, 0xda, 0x9e, 0xbc, 0x34, 0xea, 0x2f, 0x63, 0xb5, 0x3d}, + }, + { + {0xd5, 0x25, 0x98, 0x82, 0xb1, 0x90, 0x49, 0x2e, 0x91, 0x89, 0x9a, + 0x3e, 0x87, 0xeb, 0xea, 0xed, 0xf8, 0x4a, 0x70, 0x4c, 0x39, 0x3d, + 0xf0, 0xee, 0xe, 0x2b, 0xdf, 0x95, 0xa4, 0x7e, 0x19, 0x59}, + {0xd8, 0xd9, 0x5d, 0xf7, 0x2b, 0xee, 0x6e, 0xf4, 0xa5, 0x59, 0x67, + 0x39, 0xf6, 0xb1, 0x17, 0xd, 0x73, 0x72, 0x9e, 0x49, 0x31, 0xd1, + 0xf2, 0x1b, 0x13, 0x5f, 0xd7, 0x49, 0xdf, 0x1a, 0x32, 0x4}, + {0xae, 0x5a, 0xe5, 0xe4, 0x19, 0x60, 0xe1, 0x4, 0xe9, 0x92, 0x2f, + 0x7e, 0x7a, 0x43, 0x7b, 0xe7, 0xa4, 0x9a, 0x15, 0x6f, 0xc1, 0x2d, + 0xce, 0xc7, 0xc0, 0xc, 0xd7, 0xf4, 0xc1, 0xfd, 0xea, 0x45}, + }, + { + {0xed, 0xb1, 0xcc, 0xcf, 0x24, 0x46, 0xe, 0xb6, 0x95, 0x3, 0x5c, + 0xbd, 0x92, 0xc2, 0xdb, 
0x59, 0xc9, 0x81, 0x4, 0xdc, 0x1d, 0x9d, + 0xa0, 0x31, 0x40, 0xd9, 0x56, 0x5d, 0xea, 0xce, 0x73, 0x3f}, + {0x2b, 0xd7, 0x45, 0x80, 0x85, 0x1, 0x84, 0x69, 0x51, 0x6, 0x2f, + 0xcf, 0xa2, 0xfa, 0x22, 0x4c, 0xc6, 0x2d, 0x22, 0x6b, 0x65, 0x36, + 0x1a, 0x94, 0xde, 0xda, 0x62, 0x3, 0xc8, 0xeb, 0x5e, 0x5a}, + {0xc6, 0x8d, 0x4e, 0xa, 0xd1, 0xbf, 0xa7, 0xb7, 0x39, 0xb3, 0xc9, + 0x44, 0x7e, 0x0, 0x57, 0xbe, 0xfa, 0xae, 0x57, 0x15, 0x7f, 0x20, + 0xc1, 0x60, 0xdb, 0x18, 0x62, 0x26, 0x91, 0x88, 0x5, 0x26}, + }, + { + {0x42, 0xe5, 0x76, 0xc6, 0x3c, 0x8e, 0x81, 0x4c, 0xad, 0xcc, 0xce, + 0x3, 0x93, 0x2c, 0x42, 0x5e, 0x8, 0x9f, 0x12, 0xb4, 0xca, 0xcc, + 0x7, 0xec, 0xb8, 0x43, 0x44, 0xb2, 0x10, 0xfa, 0xed, 0xd}, + {0x4, 0xff, 0x60, 0x83, 0xa6, 0x4, 0xf7, 0x59, 0xf4, 0xe6, 0x61, + 0x76, 0xde, 0x3f, 0xd9, 0xc3, 0x51, 0x35, 0x87, 0x12, 0x73, 0x2a, + 0x1b, 0x83, 0x57, 0x5d, 0x61, 0x4e, 0x2e, 0xc, 0xad, 0x54}, + {0x2a, 0x52, 0x2b, 0xb8, 0xd5, 0x67, 0x3b, 0xee, 0xeb, 0xc1, 0xa5, + 0x9f, 0x46, 0x63, 0xf1, 0x36, 0xd3, 0x9f, 0xc1, 0x6e, 0xf2, 0xd2, + 0xb4, 0xa5, 0x8, 0x94, 0x7a, 0xa7, 0xba, 0xb2, 0xec, 0x62}, + }, + { + {0x74, 0x28, 0xb6, 0xaf, 0x36, 0x28, 0x7, 0x92, 0xa5, 0x4, 0xe1, + 0x79, 0x85, 0x5e, 0xcd, 0x5f, 0x4a, 0xa1, 0x30, 0xc6, 0xad, 0x1, + 0xad, 0x5a, 0x98, 0x3f, 0x66, 0x75, 0x50, 0x3d, 0x91, 0x61}, + {0x3d, 0x2b, 0x15, 0x61, 0x52, 0x79, 0xed, 0xe5, 0xd1, 0xd7, 0xdd, + 0xe, 0x7d, 0x35, 0x62, 0x49, 0x71, 0x4c, 0x6b, 0xb9, 0xd0, 0xc8, + 0x82, 0x74, 0xbe, 0xd8, 0x66, 0xa9, 0x19, 0xf9, 0x59, 0x2e}, + {0xda, 0x31, 0x32, 0x1a, 0x36, 0x2d, 0xc6, 0xd, 0x70, 0x2, 0x20, + 0x94, 0x32, 0x58, 0x47, 0xfa, 0xce, 0x94, 0x95, 0x3f, 0x51, 0x1, + 0xd8, 0x2, 0x5c, 0x5d, 0xc0, 0x31, 0xa1, 0xc2, 0xdb, 0x3d}, + }, + }, + { + { + {0x14, 0xbb, 0x96, 0x27, 0xa2, 0x57, 0xaa, 0xf3, 0x21, 0xda, 0x7, + 0x9b, 0xb7, 0xba, 0x3a, 0x88, 0x1c, 0x39, 0xa0, 0x31, 0x18, 0xe2, + 0x4b, 0xe5, 0xf9, 0x5, 0x32, 0xd8, 0x38, 0xfb, 0xe7, 0x5e}, + {0x4b, 0xc5, 0x5e, 0xce, 0xf9, 0xf, 0xdc, 0x9a, 0xd, 0x13, 0x2f, + 0x8c, 0x6b, 0x2a, 0x9c, 0x3, 0x15, 0x95, 0xf8, 0xf0, 0xc7, 0x7, + 0x80, 0x2, 0x6b, 0xb3, 0x4, 0xac, 0x14, 0x83, 0x96, 0x78}, + {0x8e, 0x6a, 0x44, 0x41, 0xcb, 0xfd, 0x8d, 0x53, 0xf9, 0x37, 0x49, + 0x43, 0xa9, 0xfd, 0xac, 0xa5, 0x78, 0x8c, 0x3c, 0x26, 0x8d, 0x90, + 0xaf, 0x46, 0x9, 0xd, 0xca, 0x9b, 0x3c, 0x63, 0xd0, 0x61}, + }, + { + {0xdf, 0x73, 0xfc, 0xf8, 0xbc, 0x28, 0xa3, 0xad, 0xfc, 0x37, 0xf0, + 0xa6, 0x5d, 0x69, 0x84, 0xee, 0x9, 0xa9, 0xc2, 0x38, 0xdb, 0xb4, + 0x7f, 0x63, 0xdc, 0x7b, 0x6, 0xf8, 0x2d, 0xac, 0x23, 0x5b}, + {0x66, 0x25, 0xdb, 0xff, 0x35, 0x49, 0x74, 0x63, 0xbb, 0x68, 0xb, + 0x78, 0x89, 0x6b, 0xbd, 0xc5, 0x3, 0xec, 0x3e, 0x55, 0x80, 0x32, + 0x1b, 0x6f, 0xf5, 0xd7, 0xae, 0x47, 0xd8, 0x5f, 0x96, 0x6e}, + {0x7b, 0x52, 0x80, 0xee, 0x53, 0xb9, 0xd2, 0x9a, 0x8d, 0x6d, 0xde, + 0xfa, 0xaa, 0x19, 0x8f, 0xe8, 0xcf, 0x82, 0xe, 0x15, 0x4, 0x17, + 0x71, 0xe, 0xdc, 0xde, 0x95, 0xdd, 0xb9, 0xbb, 0xb9, 0x79}, + }, + { + {0x74, 0x73, 0x9f, 0x8e, 0xae, 0x7d, 0x99, 0xd1, 0x16, 0x8, 0xbb, + 0xcf, 0xf8, 0xa2, 0x32, 0xa0, 0xa, 0x5f, 0x44, 0x6d, 0x12, 0xba, + 0x6c, 0xcd, 0x34, 0xb8, 0xcc, 0xa, 0x46, 0x11, 0xa8, 0x1b}, + {0xc2, 0x26, 0x31, 0x6a, 0x40, 0x55, 0xb3, 0xeb, 0x93, 0xc3, 0xc8, + 0x68, 0xa8, 0x83, 0x63, 0xd2, 0x82, 0x7a, 0xb9, 0xe5, 0x29, 0x64, + 0xc, 0x6c, 0x47, 0x21, 0xfd, 0xc9, 0x58, 0xf1, 0x65, 0x50}, + {0x54, 0x99, 0x42, 0xc, 0xfb, 0x69, 0x81, 0x70, 0x67, 0xcf, 0x6e, + 0xd7, 0xac, 0x0, 0x46, 0xe1, 0xba, 0x45, 0xe6, 0x70, 0x8a, 0xb9, + 0xaa, 0x2e, 0xf2, 0xfa, 0xa4, 0x58, 0x9e, 0xf3, 0x81, 0x39}, + }, + { + {0xde, 0x6f, 0xe6, 0x6d, 0xa5, 
0xdf, 0x45, 0xc8, 0x3a, 0x48, 0x40, + 0x2c, 0x0, 0xa5, 0x52, 0xe1, 0x32, 0xf6, 0xb4, 0xc7, 0x63, 0xe1, + 0xd2, 0xe9, 0x65, 0x1b, 0xbc, 0xdc, 0x2e, 0x45, 0xf4, 0x30}, + {0x93, 0xa, 0x23, 0x59, 0x75, 0x8a, 0xfb, 0x18, 0x5d, 0xf4, 0xe6, + 0x60, 0x69, 0x8f, 0x16, 0x1d, 0xb5, 0x3c, 0xa9, 0x14, 0x45, 0xa9, + 0x85, 0x3a, 0xfd, 0xd0, 0xac, 0x5, 0x37, 0x8, 0xdc, 0x38}, + {0x40, 0x97, 0x75, 0xc5, 0x82, 0x27, 0x6d, 0x85, 0xcc, 0xbe, 0x9c, + 0xf9, 0x69, 0x45, 0x13, 0xfa, 0x71, 0x4e, 0xea, 0xc0, 0x73, 0xfc, + 0x44, 0x88, 0x69, 0x24, 0x3f, 0x59, 0x1a, 0x9a, 0x2d, 0x63}, + }, + { + {0xa7, 0x84, 0xc, 0xed, 0x11, 0xfd, 0x9, 0xbf, 0x3a, 0x69, 0x9f, + 0xd, 0x81, 0x71, 0xf0, 0x63, 0x79, 0x87, 0xcf, 0x57, 0x2d, 0x8c, + 0x90, 0x21, 0xa2, 0x4b, 0xf6, 0x8a, 0xf2, 0x7d, 0x5a, 0x3a}, + {0xa6, 0xcb, 0x7, 0xb8, 0x15, 0x6b, 0xbb, 0xf6, 0xd7, 0xf0, 0x54, + 0xbc, 0xdf, 0xc7, 0x23, 0x18, 0xb, 0x67, 0x29, 0x6e, 0x3, 0x97, + 0x1d, 0xbb, 0x57, 0x4a, 0xed, 0x47, 0x88, 0xf4, 0x24, 0xb}, + {0xc7, 0xea, 0x1b, 0x51, 0xbe, 0xd4, 0xda, 0xdc, 0xf2, 0xcc, 0x26, + 0xed, 0x75, 0x80, 0x53, 0xa4, 0x65, 0x9a, 0x5f, 0x0, 0x9f, 0xff, + 0x9c, 0xe1, 0x63, 0x1f, 0x48, 0x75, 0x44, 0xf7, 0xfc, 0x34}, + }, + { + {0x98, 0xaa, 0xcf, 0x78, 0xab, 0x1d, 0xbb, 0xa5, 0xf2, 0x72, 0xb, + 0x19, 0x67, 0xa2, 0xed, 0x5c, 0x8e, 0x60, 0x92, 0xa, 0x11, 0xc9, + 0x9, 0x93, 0xb0, 0x74, 0xb3, 0x2f, 0x4, 0xa3, 0x19, 0x1}, + {0xca, 0x67, 0x97, 0x78, 0x4c, 0xe0, 0x97, 0xc1, 0x7d, 0x46, 0xd9, + 0x38, 0xcb, 0x4d, 0x71, 0xb8, 0xa8, 0x5f, 0xf9, 0x83, 0x82, 0x88, + 0xde, 0x55, 0xf7, 0x63, 0xfa, 0x4d, 0x16, 0xdc, 0x3b, 0x3d}, + {0x7d, 0x17, 0xc2, 0xe8, 0x9c, 0xd8, 0xa2, 0x67, 0xc1, 0xd0, 0x95, + 0x68, 0xf6, 0xa5, 0x9d, 0x66, 0xb0, 0xa2, 0x82, 0xb2, 0xe5, 0x98, + 0x65, 0xf5, 0x73, 0xa, 0xe2, 0xed, 0xf1, 0x88, 0xc0, 0x56}, + }, + { + {0x2, 0x8f, 0xf3, 0x24, 0xac, 0x5f, 0x1b, 0x58, 0xbd, 0xc, 0xe3, + 0xba, 0xfe, 0xe9, 0xb, 0xa9, 0xf0, 0x92, 0xcf, 0x8a, 0x2, 0x69, + 0x21, 0x9a, 0x8f, 0x3, 0x59, 0x83, 0xa4, 0x7e, 0x8b, 0x3}, + {0x17, 0x6e, 0xa8, 0x10, 0x11, 0x3d, 0x6d, 0x33, 0xfa, 0xb2, 0x75, + 0xb, 0x32, 0x88, 0xf3, 0xd7, 0x88, 0x29, 0x7, 0x25, 0x76, 0x33, + 0x15, 0xf9, 0x87, 0x8b, 0x10, 0x99, 0x6b, 0x4c, 0x67, 0x9}, + {0xf8, 0x6f, 0x31, 0x99, 0x21, 0xf8, 0x4e, 0x9f, 0x4f, 0x8d, 0xa7, + 0xea, 0x82, 0xd2, 0x49, 0x2f, 0x74, 0x31, 0xef, 0x5a, 0xab, 0xa5, + 0x71, 0x9, 0x65, 0xeb, 0x69, 0x59, 0x2, 0x31, 0x5e, 0x6e}, + }, + { + {0x22, 0x62, 0x6, 0x63, 0xe, 0xfb, 0x4, 0x33, 0x3f, 0xba, 0xac, + 0x87, 0x89, 0x6, 0x35, 0xfb, 0xa3, 0x61, 0x10, 0x8c, 0x77, 0x24, + 0x19, 0xbd, 0x20, 0x86, 0x83, 0xd1, 0x43, 0xad, 0x58, 0x30}, + {0xfb, 0x93, 0xe5, 0x87, 0xf5, 0x62, 0x6c, 0xb1, 0x71, 0x3e, 0x5d, + 0xca, 0xde, 0xed, 0x99, 0x49, 0x6d, 0x3e, 0xcc, 0x14, 0xe0, 0xc1, + 0x91, 0xb4, 0xa8, 0xdb, 0xa8, 0x89, 0x47, 0x11, 0xf5, 0x8}, + {0xd0, 0x63, 0x76, 0xe5, 0xfd, 0xf, 0x3c, 0x32, 0x10, 0xa6, 0x2e, + 0xa2, 0x38, 0xdf, 0xc3, 0x5, 0x9a, 0x4f, 0x99, 0xac, 0xbd, 0x8a, + 0xc7, 0xbd, 0x99, 0xdc, 0xe3, 0xef, 0xa4, 0x9f, 0x54, 0x26}, + }, + }, + { + { + {0x6e, 0x66, 0x3f, 0xaf, 0x49, 0x85, 0x46, 0xdb, 0xa5, 0xe, 0x4a, + 0xf1, 0x4, 0xcf, 0x7f, 0xd7, 0x47, 0xc, 0xba, 0xa4, 0xf7, 0x3f, + 0xf2, 0x3d, 0x85, 0x3c, 0xce, 0x32, 0xe1, 0xdf, 0x10, 0x3a}, + {0xd6, 0xf9, 0x6b, 0x1e, 0x46, 0x5a, 0x1d, 0x74, 0x81, 0xa5, 0x77, + 0x77, 0xfc, 0xb3, 0x5, 0x23, 0xd9, 0xd3, 0x74, 0x64, 0xa2, 0x74, + 0x55, 0xd4, 0xff, 0xe0, 0x1, 0x64, 0xdc, 0xe1, 0x26, 0x19}, + {0xa0, 0xce, 0x17, 0xea, 0x8a, 0x4e, 0x7f, 0xe0, 0xfd, 0xc1, 0x1f, + 0x3a, 0x46, 0x15, 0xd5, 0x2f, 0xf1, 0xc0, 0xf2, 0x31, 0xfd, 0x22, + 0x53, 0x17, 0x15, 0x5d, 0x1e, 0x86, 
0x1d, 0xd0, 0xa1, 0x1f}, + }, + { + {0xab, 0x94, 0xdf, 0xd1, 0x0, 0xac, 0xdc, 0x38, 0xe9, 0xd, 0x8, + 0xd1, 0xdd, 0x2b, 0x71, 0x2e, 0x62, 0xe2, 0xd5, 0xfd, 0x3e, 0xe9, + 0x13, 0x7f, 0xe5, 0x1, 0x9a, 0xee, 0x18, 0xed, 0xfc, 0x73}, + {0x32, 0x98, 0x59, 0x7d, 0x94, 0x55, 0x80, 0xcc, 0x20, 0x55, 0xf1, + 0x37, 0xda, 0x56, 0x46, 0x1e, 0x20, 0x93, 0x5, 0x4e, 0x74, 0xf7, + 0xf6, 0x99, 0x33, 0xcf, 0x75, 0x6a, 0xbc, 0x63, 0x35, 0x77}, + {0xb3, 0x9c, 0x13, 0x63, 0x8, 0xe9, 0xb1, 0x6, 0xcd, 0x3e, 0xa0, + 0xc5, 0x67, 0xda, 0x93, 0xa4, 0x32, 0x89, 0x63, 0xad, 0xc8, 0xce, + 0x77, 0x8d, 0x44, 0x4f, 0x86, 0x1b, 0x70, 0x6b, 0x42, 0x1f}, + }, + { + {0x52, 0x25, 0xa1, 0x91, 0xc8, 0x35, 0x7e, 0xf1, 0x76, 0x9c, 0x5e, + 0x57, 0x53, 0x81, 0x6b, 0xb7, 0x3e, 0x72, 0x9b, 0xd, 0x6f, 0x40, + 0x83, 0xfa, 0x38, 0xe4, 0xa7, 0x3f, 0x1b, 0xbb, 0x76, 0xb}, + {0x1, 0x1c, 0x91, 0x41, 0x4c, 0x26, 0xc9, 0xef, 0x25, 0x2c, 0xa2, + 0x17, 0xb8, 0xb7, 0xa3, 0xf1, 0x47, 0x14, 0xf, 0xf3, 0x6b, 0xda, + 0x75, 0x58, 0x90, 0xb0, 0x31, 0x1d, 0x27, 0xf5, 0x1a, 0x4e}, + {0x9b, 0x93, 0x92, 0x7f, 0xf9, 0xc1, 0xb8, 0x8, 0x6e, 0xab, 0x44, + 0xd4, 0xcb, 0x71, 0x67, 0xbe, 0x17, 0x80, 0xbb, 0x99, 0x63, 0x64, + 0xe5, 0x22, 0x55, 0xa9, 0x72, 0xb7, 0x1e, 0xd6, 0x6d, 0x7b}, + }, + { + {0xc7, 0xd2, 0x1, 0xab, 0xf9, 0xab, 0x30, 0x57, 0x18, 0x3b, 0x14, + 0x40, 0xdc, 0x76, 0xfb, 0x16, 0x81, 0xb2, 0xcb, 0xa0, 0x65, 0xbe, + 0x6c, 0x86, 0xfe, 0x6a, 0xff, 0x9b, 0x65, 0x9b, 0xfa, 0x53}, + {0x92, 0x3d, 0xf3, 0x50, 0xe8, 0xc1, 0xad, 0xb7, 0xcf, 0xd5, 0x8c, + 0x60, 0x4f, 0xfa, 0x98, 0x79, 0xdb, 0x5b, 0xfc, 0x8d, 0xbd, 0x2d, + 0x96, 0xad, 0x4f, 0x2f, 0x1d, 0xaf, 0xce, 0x9b, 0x3e, 0x70}, + {0x55, 0x54, 0x88, 0x94, 0xe9, 0xc8, 0x14, 0x6c, 0xe5, 0xd4, 0xae, + 0x65, 0x66, 0x5d, 0x3a, 0x84, 0xf1, 0x5a, 0xd6, 0xbc, 0x3e, 0xb7, + 0x1b, 0x18, 0x50, 0x1f, 0xc6, 0xc4, 0xe5, 0x93, 0x8d, 0x39}, + }, + { + {0xf2, 0xe3, 0xe7, 0xd2, 0x60, 0x7c, 0x87, 0xc3, 0xb1, 0x8b, 0x82, + 0x30, 0xa0, 0xaa, 0x34, 0x3b, 0x38, 0xf1, 0x9e, 0x73, 0xe7, 0x26, + 0x3e, 0x28, 0x77, 0x5, 0xc3, 0x2, 0x90, 0x9c, 0x9c, 0x69}, + {0xf3, 0x48, 0xe2, 0x33, 0x67, 0xd1, 0x4b, 0x1c, 0x5f, 0xa, 0xbf, + 0x15, 0x87, 0x12, 0x9e, 0xbd, 0x76, 0x3, 0xb, 0xa1, 0xf0, 0x8c, + 0x3f, 0xd4, 0x13, 0x1b, 0x19, 0xdf, 0x5d, 0x9b, 0xb0, 0x53}, + {0xcc, 0xf1, 0x46, 0x59, 0x23, 0xa7, 0x6, 0xf3, 0x7d, 0xd9, 0xe5, + 0xcc, 0xb5, 0x18, 0x17, 0x92, 0x75, 0xe9, 0xb4, 0x81, 0x47, 0xd2, + 0xcd, 0x28, 0x7, 0xd9, 0xcd, 0x6f, 0xc, 0xf3, 0xca, 0x51}, + }, + { + {0xc7, 0x54, 0xac, 0x18, 0x9a, 0xf9, 0x7a, 0x73, 0xf, 0xb3, 0x1c, + 0xc5, 0xdc, 0x78, 0x33, 0x90, 0xc7, 0xc, 0xe1, 0x4c, 0x33, 0xbc, + 0x89, 0x2b, 0x9a, 0xe9, 0xf8, 0x89, 0xc1, 0x29, 0xae, 0x12}, + {0xa, 0xe0, 0x74, 0x76, 0x42, 0xa7, 0xb, 0xa6, 0xf3, 0x7b, 0x7a, + 0xa1, 0x70, 0x85, 0xe, 0x63, 0xcc, 0x24, 0x33, 0xcf, 0x3d, 0x56, + 0x58, 0x37, 0xaa, 0xfd, 0x83, 0x23, 0x29, 0xaa, 0x4, 0x55}, + {0xcf, 0x1, 0xd, 0x1f, 0xcb, 0xc0, 0x9e, 0xa9, 0xae, 0xf7, 0x34, + 0x3a, 0xcc, 0xef, 0xd1, 0xd, 0x22, 0x4e, 0x9c, 0xd0, 0x21, 0x75, + 0xca, 0x55, 0xea, 0xa5, 0xeb, 0x58, 0xe9, 0x4f, 0xd1, 0x5f}, + }, + { + {0x8e, 0xcb, 0x93, 0xbf, 0x5e, 0xfe, 0x42, 0x3c, 0x5f, 0x56, 0xd4, + 0x36, 0x51, 0xa8, 0xdf, 0xbe, 0xe8, 0x20, 0x42, 0x88, 0x9e, 0x85, + 0xf0, 0xe0, 0x28, 0xd1, 0x25, 0x7, 0x96, 0x3f, 0xd7, 0x7d}, + {0x2c, 0xab, 0x45, 0x28, 0xdf, 0x2d, 0xdc, 0xb5, 0x93, 0xe9, 0x7f, + 0xa, 0xb1, 0x91, 0x94, 0x6, 0x46, 0xe3, 0x2, 0x40, 0xd6, 0xf3, + 0xaa, 0x4d, 0xd1, 0x74, 0x64, 0x58, 0x6e, 0xf2, 0x3f, 0x9}, + {0x29, 0x98, 0x5, 0x68, 0xfe, 0x24, 0xd, 0xb1, 0xe5, 0x23, 0xaf, + 0xdb, 0x72, 0x6, 0x73, 0x75, 0x29, 0xac, 
0x57, 0xb4, 0x3a, 0x25, + 0x67, 0x13, 0xa4, 0x70, 0xb4, 0x86, 0xbc, 0xbc, 0x59, 0x2f}, + }, + { + {0x1, 0xc3, 0x91, 0xb6, 0x60, 0xd5, 0x41, 0x70, 0x1e, 0xe7, 0xd7, + 0xad, 0x3f, 0x1b, 0x20, 0x85, 0x85, 0x55, 0x33, 0x11, 0x63, 0xe1, + 0xc2, 0x16, 0xb1, 0x28, 0x8, 0x1, 0x3d, 0x5e, 0xa5, 0x2a}, + {0x5f, 0x13, 0x17, 0x99, 0x42, 0x7d, 0x84, 0x83, 0xd7, 0x3, 0x7d, + 0x56, 0x1f, 0x91, 0x1b, 0xad, 0xd1, 0xaa, 0x77, 0xbe, 0xd9, 0x48, + 0x77, 0x7e, 0x4a, 0xaf, 0x51, 0x2e, 0x2e, 0xb4, 0x58, 0x54}, + {0x4f, 0x44, 0x7, 0xc, 0xe6, 0x92, 0x51, 0xed, 0x10, 0x1d, 0x42, + 0x74, 0x2d, 0x4e, 0xc5, 0x42, 0x64, 0xc8, 0xb5, 0xfd, 0x82, 0x4c, + 0x2b, 0x35, 0x64, 0x86, 0x76, 0x8a, 0x4a, 0x0, 0xe9, 0x13}, + }, + }, + { + { + {0x7f, 0x87, 0x3b, 0x19, 0xc9, 0x0, 0x2e, 0xbb, 0x6b, 0x50, 0xdc, + 0xe0, 0x90, 0xa8, 0xe3, 0xec, 0x9f, 0x64, 0xde, 0x36, 0xc0, 0xb7, + 0xf3, 0xec, 0x1a, 0x9e, 0xde, 0x98, 0x8, 0x4, 0x46, 0x5f}, + {0xdb, 0xce, 0x2f, 0x83, 0x45, 0x88, 0x9d, 0x73, 0x63, 0xf8, 0x6b, + 0xae, 0xc9, 0xd6, 0x38, 0xfa, 0xf7, 0xfe, 0x4f, 0xb7, 0xca, 0xd, + 0xbc, 0x32, 0x5e, 0xe4, 0xbc, 0x14, 0x88, 0x7e, 0x93, 0x73}, + {0x8d, 0xf4, 0x7b, 0x29, 0x16, 0x71, 0x3, 0xb9, 0x34, 0x68, 0xf0, + 0xd4, 0x22, 0x3b, 0xd1, 0xa9, 0xc6, 0xbd, 0x96, 0x46, 0x57, 0x15, + 0x97, 0xe1, 0x35, 0xe8, 0xd5, 0x91, 0xe8, 0xa4, 0xf8, 0x2c}, + }, + { + {0xa2, 0x6b, 0xd0, 0x17, 0x7e, 0x48, 0xb5, 0x2c, 0x6b, 0x19, 0x50, + 0x39, 0x1c, 0x38, 0xd2, 0x24, 0x30, 0x8a, 0x97, 0x85, 0x81, 0x9c, + 0x65, 0xd7, 0xf6, 0xa4, 0xd6, 0x91, 0x28, 0x7f, 0x6f, 0x7a}, + {0x67, 0xf, 0x11, 0x7, 0x87, 0xfd, 0x93, 0x6d, 0x49, 0xb5, 0x38, + 0x7c, 0xd3, 0x9, 0x4c, 0xdd, 0x86, 0x6a, 0x73, 0xc2, 0x4c, 0x6a, + 0xb1, 0x7c, 0x9, 0x2a, 0x25, 0x58, 0x6e, 0xbd, 0x49, 0x20}, + {0x49, 0xef, 0x9a, 0x6a, 0x8d, 0xfd, 0x9, 0x7d, 0xb, 0xb9, 0x3d, + 0x5b, 0xbe, 0x60, 0xee, 0xf0, 0xd4, 0xbf, 0x9e, 0x51, 0x2c, 0xb5, + 0x21, 0x4c, 0x1d, 0x94, 0x45, 0xc5, 0xdf, 0xaa, 0x11, 0x60}, + }, + { + {0x90, 0xf8, 0xcb, 0x2, 0xc8, 0xd0, 0xde, 0x63, 0xaa, 0x6a, 0xff, + 0xd, 0xca, 0x98, 0xd0, 0xfb, 0x99, 0xed, 0xb6, 0xb9, 0xfd, 0xa, + 0x4d, 0x62, 0x1e, 0xb, 0x34, 0x79, 0xb7, 0x18, 0xce, 0x69}, + {0x3c, 0xf8, 0x95, 0xcf, 0x6d, 0x92, 0x67, 0x5f, 0x71, 0x90, 0x28, + 0x71, 0x61, 0x85, 0x7e, 0x7c, 0x5b, 0x7a, 0x8f, 0x99, 0xf3, 0xe7, + 0xa1, 0xd6, 0xe0, 0xf9, 0x62, 0xb, 0x1b, 0xcc, 0xc5, 0x6f}, + {0xcb, 0x79, 0x98, 0xb2, 0x28, 0x55, 0xef, 0xd1, 0x92, 0x90, 0x7e, + 0xd4, 0x3c, 0xae, 0x1a, 0xdd, 0x52, 0x23, 0x9f, 0x18, 0x42, 0x4, + 0x7e, 0x12, 0xf1, 0x1, 0x71, 0xe5, 0x3a, 0x6b, 0x59, 0x15}, + }, + { + {0xca, 0x24, 0x51, 0x7e, 0x16, 0x31, 0xff, 0x9, 0xdf, 0x45, 0xc7, + 0xd9, 0x8b, 0x15, 0xe4, 0xb, 0xe5, 0x56, 0xf5, 0x7e, 0x22, 0x7d, + 0x2b, 0x29, 0x38, 0xd1, 0xb6, 0xaf, 0x41, 0xe2, 0xa4, 0x3a}, + {0xa2, 0x79, 0x91, 0x3f, 0xd2, 0x39, 0x27, 0x46, 0xcf, 0xdd, 0xd6, + 0x97, 0x31, 0x12, 0x83, 0xff, 0x8a, 0x14, 0xf2, 0x53, 0xb5, 0xde, + 0x7, 0x13, 0xda, 0x4d, 0x5f, 0x7b, 0x68, 0x37, 0x22, 0xd}, + {0xf5, 0x5, 0x33, 0x2a, 0xbf, 0x38, 0xc1, 0x2c, 0xc3, 0x26, 0xe9, + 0xa2, 0x8f, 0x3f, 0x58, 0x48, 0xeb, 0xd2, 0x49, 0x55, 0xa2, 0xb1, + 0x3a, 0x8, 0x6c, 0xa3, 0x87, 0x46, 0x6e, 0xaa, 0xfc, 0x32}, + }, + { + {0xdf, 0xcc, 0x87, 0x27, 0x73, 0xa4, 0x7, 0x32, 0xf8, 0xe3, 0x13, + 0xf2, 0x8, 0x19, 0xe3, 0x17, 0x4e, 0x96, 0xd, 0xf6, 0xd7, 0xec, + 0xb2, 0xd5, 0xe9, 0xb, 0x60, 0xc2, 0x36, 0x63, 0x6f, 0x74}, + {0xf5, 0x9a, 0x7d, 0xc5, 0x8d, 0x6e, 0xc5, 0x7b, 0xf2, 0xbd, 0xf0, + 0x9d, 0xed, 0xd2, 0xb, 0x3e, 0xa3, 0xe4, 0xef, 0x22, 0xde, 0x14, + 0xc0, 0xaa, 0x5c, 0x6a, 0xbd, 0xfe, 0xce, 0xe9, 0x27, 0x46}, + {0x1c, 0x97, 0x6c, 0xab, 0x45, 0xf3, 
0x4a, 0x3f, 0x1f, 0x73, 0x43, + 0x99, 0x72, 0xeb, 0x88, 0xe2, 0x6d, 0x18, 0x44, 0x3, 0x8a, 0x6a, + 0x59, 0x33, 0x93, 0x62, 0xd6, 0x7e, 0x0, 0x17, 0x49, 0x7b}, + }, + { + {0xdd, 0xa2, 0x53, 0xdd, 0x28, 0x1b, 0x34, 0x54, 0x3f, 0xfc, 0x42, + 0xdf, 0x5b, 0x90, 0x17, 0xaa, 0xf4, 0xf8, 0xd2, 0x4d, 0xd9, 0x92, + 0xf5, 0xf, 0x7d, 0xd3, 0x8c, 0xe0, 0xf, 0x62, 0x3, 0x1d}, + {0x64, 0xb0, 0x84, 0xab, 0x5c, 0xfb, 0x85, 0x2d, 0x14, 0xbc, 0xf3, + 0x89, 0xd2, 0x10, 0x78, 0x49, 0xc, 0xce, 0x15, 0x7b, 0x44, 0xdc, + 0x6a, 0x47, 0x7b, 0xfd, 0x44, 0xf8, 0x76, 0xa3, 0x2b, 0x12}, + {0x54, 0xe5, 0xb4, 0xa2, 0xcd, 0x32, 0x2, 0xc2, 0x7f, 0x18, 0x5d, + 0x11, 0x42, 0xfd, 0xd0, 0x9e, 0xd9, 0x79, 0xd4, 0x7d, 0xbe, 0xb4, + 0xab, 0x2e, 0x4c, 0xec, 0x68, 0x2b, 0xf5, 0xb, 0xc7, 0x2}, + }, + { + {0xe1, 0x72, 0x8d, 0x45, 0xbf, 0x32, 0xe5, 0xac, 0xb5, 0x3c, 0xb7, + 0x7c, 0xe0, 0x68, 0xe7, 0x5b, 0xe7, 0xbd, 0x8b, 0xee, 0x94, 0x7d, + 0xcf, 0x56, 0x3, 0x3a, 0xb4, 0xfe, 0xe3, 0x97, 0x6, 0x6b}, + {0xbb, 0x2f, 0xb, 0x5d, 0x4b, 0xec, 0x87, 0xa2, 0xca, 0x82, 0x48, + 0x7, 0x90, 0x57, 0x5c, 0x41, 0x5c, 0x81, 0xd0, 0xc1, 0x1e, 0xa6, + 0x44, 0xe0, 0xe0, 0xf5, 0x9e, 0x40, 0xa, 0x4f, 0x33, 0x26}, + {0xc0, 0xa3, 0x62, 0xdf, 0x4a, 0xf0, 0xc8, 0xb6, 0x5d, 0xa4, 0x6d, + 0x7, 0xef, 0x0, 0xf0, 0x3e, 0xa9, 0xd2, 0xf0, 0x49, 0x58, 0xb9, + 0x9c, 0x9c, 0xae, 0x2f, 0x1b, 0x44, 0x43, 0x7f, 0xc3, 0x1c}, + }, + { + {0xb9, 0xae, 0xce, 0xc9, 0xf1, 0x56, 0x66, 0xd7, 0x6a, 0x65, 0xe5, + 0x18, 0xf8, 0x15, 0x5b, 0x1c, 0x34, 0x23, 0x4c, 0x84, 0x32, 0x28, + 0xe7, 0x26, 0x38, 0x68, 0x19, 0x2f, 0x77, 0x6f, 0x34, 0x3a}, + {0x4f, 0x32, 0xc7, 0x5c, 0x5a, 0x56, 0x8f, 0x50, 0x22, 0xa9, 0x6, + 0xe5, 0xc0, 0xc4, 0x61, 0xd0, 0x19, 0xac, 0x45, 0x5c, 0xdb, 0xab, + 0x18, 0xfb, 0x4a, 0x31, 0x80, 0x3, 0xc1, 0x9, 0x68, 0x6c}, + {0xc8, 0x6a, 0xda, 0xe2, 0x12, 0x51, 0xd5, 0xd2, 0xed, 0x51, 0xe8, + 0xb1, 0x31, 0x3, 0xbd, 0xe9, 0x62, 0x72, 0xc6, 0x8e, 0xdd, 0x46, + 0x7, 0x96, 0xd0, 0xc5, 0xf7, 0x6e, 0x9f, 0x1b, 0x91, 0x5}, + }, + }, + { + { + {0xef, 0xea, 0x2e, 0x51, 0xf3, 0xac, 0x49, 0x53, 0x49, 0xcb, 0xc1, + 0x1c, 0xd3, 0x41, 0xc1, 0x20, 0x8d, 0x68, 0x9a, 0xa9, 0x7, 0xc, + 0x18, 0x24, 0x17, 0x2d, 0x4b, 0xc6, 0xd1, 0xf9, 0x5e, 0x55}, + {0xbb, 0xe, 0xdf, 0xf5, 0x83, 0x99, 0x33, 0xc1, 0xac, 0x4c, 0x2c, + 0x51, 0x8f, 0x75, 0xf3, 0xc0, 0xe1, 0x98, 0xb3, 0xb, 0xa, 0x13, + 0xf1, 0x2c, 0x62, 0xc, 0x27, 0xaa, 0xf9, 0xec, 0x3c, 0x6b}, + {0x8, 0xbd, 0x73, 0x3b, 0xba, 0x70, 0xa7, 0x36, 0xc, 0xbf, 0xaf, + 0xa3, 0x8, 0xef, 0x4a, 0x62, 0xf2, 0x46, 0x9, 0xb4, 0x98, 0xff, + 0x37, 0x57, 0x9d, 0x74, 0x81, 0x33, 0xe1, 0x4d, 0x5f, 0x67}, + }, + { + {0x1d, 0xb3, 0xda, 0x3b, 0xd9, 0xf6, 0x2f, 0xa1, 0xfe, 0x2d, 0x65, + 0x9d, 0xf, 0xd8, 0x25, 0x7, 0x87, 0x94, 0xbe, 0x9a, 0xf3, 0x4f, + 0x9c, 0x1, 0x43, 0x3c, 0xcd, 0x82, 0xb8, 0x50, 0xf4, 0x60}, + {0xfc, 0x82, 0x17, 0x6b, 0x3, 0x52, 0x2c, 0xe, 0xb4, 0x83, 0xad, + 0x6c, 0x81, 0x6c, 0x81, 0x64, 0x3e, 0x7, 0x64, 0x69, 0xd9, 0xbd, + 0xdc, 0xd0, 0x20, 0xc5, 0x64, 0x1, 0xf7, 0x9d, 0xd9, 0x13}, + {0xca, 0xc0, 0xe5, 0x21, 0xc3, 0x5e, 0x4b, 0x1, 0xa2, 0xbf, 0x19, + 0xd7, 0xc9, 0x69, 0xcb, 0x4f, 0xa0, 0x23, 0x0, 0x75, 0x18, 0x1c, + 0x5f, 0x4e, 0x80, 0xac, 0xed, 0x55, 0x9e, 0xde, 0x6, 0x1c}, + }, + { + {0xaa, 0x69, 0x6d, 0xff, 0x40, 0x2b, 0xd5, 0xff, 0xbb, 0x49, 0x40, + 0xdc, 0x18, 0xb, 0x53, 0x34, 0x97, 0x98, 0x4d, 0xa3, 0x2f, 0x5c, + 0x4a, 0x5e, 0x2d, 0xba, 0x32, 0x7d, 0x8e, 0x6f, 0x9, 0x78}, + {0xe2, 0xc4, 0x3e, 0xa3, 0xd6, 0x7a, 0xf, 0x99, 0x8e, 0xe0, 0x2e, + 0xbe, 0x38, 0xf9, 0x8, 0x66, 0x15, 0x45, 0x28, 0x63, 0xc5, 0x43, + 0xa1, 0x9c, 0xd, 0xb6, 0x2d, 0xec, 
0x1f, 0x8a, 0xf3, 0x4c}, + {0xe7, 0x5c, 0xfa, 0xd, 0x65, 0xaa, 0xaa, 0xa0, 0x8c, 0x47, 0xb5, + 0x48, 0x2a, 0x9e, 0xc4, 0xf9, 0x5b, 0x72, 0x3, 0x70, 0x7d, 0xcc, + 0x9, 0x4f, 0xbe, 0x1a, 0x9, 0x26, 0x3a, 0xad, 0x3c, 0x37}, + }, + { + {0xad, 0xbb, 0xdd, 0x89, 0xfb, 0xa8, 0xbe, 0xf1, 0xcb, 0xae, 0xae, + 0x61, 0xbc, 0x2c, 0xcb, 0x3b, 0x9d, 0x8d, 0x9b, 0x1f, 0xbb, 0xa7, + 0x58, 0x8f, 0x86, 0xa6, 0x12, 0x51, 0xda, 0x7e, 0x54, 0x21}, + {0x7c, 0xf5, 0xc9, 0x82, 0x4d, 0x63, 0x94, 0xb2, 0x36, 0x45, 0x93, + 0x24, 0xe1, 0xfd, 0xcb, 0x1f, 0x5a, 0xdb, 0x8c, 0x41, 0xb3, 0x4d, + 0x9c, 0x9e, 0xfc, 0x19, 0x44, 0x45, 0xd9, 0xf3, 0x40, 0x0}, + {0xd3, 0x86, 0x59, 0xfd, 0x39, 0xe9, 0xfd, 0xde, 0xc, 0x38, 0xa, + 0x51, 0x89, 0x2c, 0x27, 0xf4, 0xb9, 0x19, 0x31, 0xbb, 0x7, 0xa4, + 0x2b, 0xb7, 0xf4, 0x4d, 0x25, 0x4a, 0x33, 0xa, 0x55, 0x63}, + }, + { + {0x49, 0x7b, 0x54, 0x72, 0x45, 0x58, 0xba, 0x9b, 0xe0, 0x8, 0xc4, + 0xe2, 0xfa, 0xc6, 0x5, 0xf3, 0x8d, 0xf1, 0x34, 0xc7, 0x69, 0xfa, + 0xe8, 0x60, 0x7a, 0x76, 0x7d, 0xaa, 0xaf, 0x2b, 0xa9, 0x39}, + {0x37, 0xcf, 0x69, 0xb5, 0xed, 0xd6, 0x7, 0x65, 0xe1, 0x2e, 0xa5, + 0xc, 0xb0, 0x29, 0x84, 0x17, 0x5d, 0xd6, 0x6b, 0xeb, 0x90, 0x0, + 0x7c, 0xea, 0x51, 0x8f, 0xf7, 0xda, 0xc7, 0x62, 0xea, 0x3e}, + {0x4e, 0x27, 0x93, 0xe6, 0x13, 0xc7, 0x24, 0x9d, 0x75, 0xd3, 0xdb, + 0x68, 0x77, 0x85, 0x63, 0x5f, 0x9a, 0xb3, 0x8a, 0xeb, 0x60, 0x55, + 0x52, 0x70, 0xcd, 0xc4, 0xc9, 0x65, 0x6, 0x6a, 0x43, 0x68}, + }, + { + {0x7c, 0x10, 0x20, 0xe8, 0x17, 0xd3, 0x56, 0x1e, 0x65, 0xe9, 0xa, + 0x84, 0x44, 0x68, 0x26, 0xc5, 0x7a, 0xfc, 0xf, 0x32, 0xc6, 0xa1, + 0xe0, 0xc1, 0x72, 0x14, 0x61, 0x91, 0x9c, 0x66, 0x73, 0x53}, + {0x27, 0x3f, 0x2f, 0x20, 0xe8, 0x35, 0x2, 0xbc, 0xb0, 0x75, 0xf9, + 0x64, 0xe2, 0x0, 0x5c, 0xc7, 0x16, 0x24, 0x8c, 0xa3, 0xd5, 0xe9, + 0xa4, 0x91, 0xf9, 0x89, 0xb7, 0x8a, 0xf6, 0xe7, 0xb6, 0x17}, + {0x57, 0x52, 0xe, 0x9a, 0xab, 0x14, 0x28, 0x5d, 0xfc, 0xb3, 0xca, + 0xc9, 0x84, 0x20, 0x8f, 0x90, 0xca, 0x1e, 0x2d, 0x5b, 0x88, 0xf5, + 0xca, 0xaf, 0x11, 0x7d, 0xf8, 0x78, 0xa6, 0xb5, 0xb4, 0x1c}, + }, + { + {0xe7, 0x7, 0xa0, 0xa2, 0x62, 0xaa, 0x74, 0x6b, 0xb1, 0xc7, 0x71, + 0xf0, 0xb0, 0xe0, 0x11, 0xf3, 0x23, 0xe2, 0xb, 0x0, 0x38, 0xe4, + 0x7, 0x57, 0xac, 0x6e, 0xef, 0x82, 0x2d, 0xfd, 0xc0, 0x2d}, + {0x6c, 0xfc, 0x4a, 0x39, 0x6b, 0xc0, 0x64, 0xb6, 0xb1, 0x5f, 0xda, + 0x98, 0x24, 0xde, 0x88, 0xc, 0x34, 0xd8, 0xca, 0x4b, 0x16, 0x3, + 0x8d, 0x4f, 0xa2, 0x34, 0x74, 0xde, 0x78, 0xca, 0xb, 0x33}, + {0x4e, 0x74, 0x19, 0x11, 0x84, 0xff, 0x2e, 0x98, 0x24, 0x47, 0x7, + 0x2b, 0x96, 0x5e, 0x69, 0xf9, 0xfb, 0x53, 0xc9, 0xbf, 0x4f, 0xc1, + 0x8a, 0xc5, 0xf5, 0x1c, 0x9f, 0x36, 0x1b, 0xbe, 0x31, 0x3c}, + }, + { + {0x72, 0x42, 0xcb, 0xf9, 0x93, 0xbc, 0x68, 0xc1, 0x98, 0xdb, 0xce, + 0xc7, 0x1f, 0x71, 0xb8, 0xae, 0x7a, 0x8d, 0xac, 0x34, 0xaa, 0x52, + 0xe, 0x7f, 0xbb, 0x55, 0x7d, 0x7e, 0x9, 0xc1, 0xce, 0x41}, + {0xee, 0x8a, 0x94, 0x8, 0x4d, 0x86, 0xf4, 0xb0, 0x6f, 0x1c, 0xba, + 0x91, 0xee, 0x19, 0xdc, 0x7, 0x58, 0xa1, 0xac, 0xa6, 0xae, 0xcd, + 0x75, 0x79, 0xbb, 0xd4, 0x62, 0x42, 0x13, 0x61, 0xb, 0x33}, + {0x8a, 0x80, 0x6d, 0xa2, 0xd7, 0x19, 0x96, 0xf7, 0x6d, 0x15, 0x9e, + 0x1d, 0x9e, 0xd4, 0x1f, 0xbb, 0x27, 0xdf, 0xa1, 0xdb, 0x6c, 0xc3, + 0xd7, 0x73, 0x7d, 0x77, 0x28, 0x1f, 0xd9, 0x4c, 0xb4, 0x26}, + }, + }, + { + { + {0x83, 0x3, 0x73, 0x62, 0x93, 0xf2, 0xb7, 0xe1, 0x2c, 0x8a, 0xca, + 0xeb, 0xff, 0x79, 0x52, 0x4b, 0x14, 0x13, 0xd4, 0xbf, 0x8a, 0x77, + 0xfc, 0xda, 0xf, 0x61, 0x72, 0x9c, 0x14, 0x10, 0xeb, 0x7d}, + {0x75, 0x74, 0x38, 0x8f, 0x47, 0x48, 0xf0, 0x51, 0x3c, 0xcb, 0xbe, + 0x9c, 0xf4, 0xbc, 0x5d, 0xb2, 
0x55, 0x20, 0x9f, 0xd9, 0x44, 0x12, + 0xab, 0x9a, 0xd6, 0xa5, 0x10, 0x1c, 0x6c, 0x9e, 0x70, 0x2c}, + {0x7a, 0xee, 0x66, 0x87, 0x6a, 0xaf, 0x62, 0xcb, 0xe, 0xcd, 0x53, + 0x55, 0x4, 0xec, 0xcb, 0x66, 0xb5, 0xe4, 0xb, 0xf, 0x38, 0x1, + 0x80, 0x58, 0xea, 0xe2, 0x2c, 0xf6, 0x9f, 0x8e, 0xe6, 0x8}, + }, + { + {0xf9, 0xf2, 0xb8, 0xa, 0xd5, 0x9, 0x2d, 0x2f, 0xdf, 0x23, 0x59, + 0xc5, 0x8d, 0x21, 0xb9, 0xac, 0xb9, 0x6c, 0x76, 0x73, 0x26, 0x34, + 0x8f, 0x4a, 0xf5, 0x19, 0xf7, 0x38, 0xd7, 0x3b, 0xb1, 0x4c}, + {0xad, 0x30, 0xc1, 0x4b, 0xa, 0x50, 0xad, 0x34, 0x9c, 0xd4, 0xb, + 0x3d, 0x49, 0xdb, 0x38, 0x8d, 0xbe, 0x89, 0xa, 0x50, 0x98, 0x3d, + 0x5c, 0xa2, 0x9, 0x3b, 0xba, 0xee, 0x87, 0x3f, 0x1f, 0x2f}, + {0x4a, 0xb6, 0x15, 0xe5, 0x75, 0x8c, 0x84, 0xf7, 0x38, 0x90, 0x4a, + 0xdb, 0xba, 0x1, 0x95, 0xa5, 0x50, 0x1b, 0x75, 0x3f, 0x3f, 0x31, + 0xd, 0xc2, 0xe8, 0x2e, 0xae, 0xc0, 0x53, 0xe3, 0xa1, 0x19}, + }, + { + {0xbd, 0xbd, 0x96, 0xd5, 0xcd, 0x72, 0x21, 0xb4, 0x40, 0xfc, 0xee, + 0x98, 0x43, 0x45, 0xe0, 0x93, 0xb5, 0x9, 0x41, 0xb4, 0x47, 0x53, + 0xb1, 0x9f, 0x34, 0xae, 0x66, 0x2, 0x99, 0xd3, 0x6b, 0x73}, + {0xc3, 0x5, 0xfa, 0xba, 0x60, 0x75, 0x1c, 0x7d, 0x61, 0x5e, 0xe5, + 0xc6, 0xa0, 0xa0, 0xe1, 0xb3, 0x73, 0x64, 0xd6, 0xc0, 0x18, 0x97, + 0x52, 0xe3, 0x86, 0x34, 0xc, 0xc2, 0x11, 0x6b, 0x54, 0x41}, + {0xb4, 0xb3, 0x34, 0x93, 0x50, 0x2d, 0x53, 0x85, 0x73, 0x65, 0x81, + 0x60, 0x4b, 0x11, 0xfd, 0x46, 0x75, 0x83, 0x5c, 0x42, 0x30, 0x5f, + 0x5f, 0xcc, 0x5c, 0xab, 0x7f, 0xb8, 0xa2, 0x95, 0x22, 0x41}, + }, + { + {0xc6, 0xea, 0x93, 0xe2, 0x61, 0x52, 0x65, 0x2e, 0xdb, 0xac, 0x33, + 0x21, 0x3, 0x92, 0x5a, 0x84, 0x6b, 0x99, 0x0, 0x79, 0xcb, 0x75, + 0x9, 0x46, 0x80, 0xdd, 0x5a, 0x19, 0x8d, 0xbb, 0x60, 0x7}, + {0xe9, 0xd6, 0x7e, 0xf5, 0x88, 0x9b, 0xc9, 0x19, 0x25, 0xc8, 0xf8, + 0x6d, 0x26, 0xcb, 0x93, 0x53, 0x73, 0xd2, 0xa, 0xb3, 0x13, 0x32, + 0xee, 0x5c, 0x34, 0x2e, 0x2d, 0xb5, 0xeb, 0x53, 0xe1, 0x14}, + {0x8a, 0x81, 0xe6, 0xcd, 0x17, 0x1a, 0x3e, 0x41, 0x84, 0xa0, 0x69, + 0xed, 0xa9, 0x6d, 0x15, 0x57, 0xb1, 0xcc, 0xca, 0x46, 0x8f, 0x26, + 0xbf, 0x2c, 0xf2, 0xc5, 0x3a, 0xc3, 0x9b, 0xbe, 0x34, 0x6b}, + }, + { + {0xd3, 0xf2, 0x71, 0x65, 0x65, 0x69, 0xfc, 0x11, 0x7a, 0x73, 0xe, + 0x53, 0x45, 0xe8, 0xc9, 0xc6, 0x35, 0x50, 0xfe, 0xd4, 0xa2, 0xe7, + 0x3a, 0xe3, 0xb, 0xd3, 0x6d, 0x2e, 0xb6, 0xc7, 0xb9, 0x1}, + {0xb2, 0xc0, 0x78, 0x3a, 0x64, 0x2f, 0xdf, 0xf3, 0x7c, 0x2, 0x2e, + 0xf2, 0x1e, 0x97, 0x3e, 0x4c, 0xa3, 0xb5, 0xc1, 0x49, 0x5e, 0x1c, + 0x7d, 0xec, 0x2d, 0xdd, 0x22, 0x9, 0x8f, 0xc1, 0x12, 0x20}, + {0x29, 0x9d, 0xc8, 0x5a, 0xe5, 0x55, 0xb, 0x88, 0x63, 0xa7, 0xa0, + 0x45, 0x1f, 0x24, 0x83, 0x14, 0x1f, 0x6c, 0xe7, 0xc2, 0xdf, 0xef, + 0x36, 0x3d, 0xe8, 0xad, 0x4b, 0x4e, 0x78, 0x5b, 0xaf, 0x8}, + }, + { + {0x4b, 0x2c, 0xcc, 0x89, 0xd2, 0x14, 0x73, 0xe2, 0x8d, 0x17, 0x87, + 0xa2, 0x11, 0xbd, 0xe4, 0x4b, 0xce, 0x64, 0x33, 0xfa, 0xd6, 0x28, + 0xd5, 0x18, 0x6e, 0x82, 0xd9, 0xaf, 0xd5, 0xc1, 0x23, 0x64}, + {0x33, 0x25, 0x1f, 0x88, 0xdc, 0x99, 0x34, 0x28, 0xb6, 0x23, 0x93, + 0x77, 0xda, 0x25, 0x5, 0x9d, 0xf4, 0x41, 0x34, 0x67, 0xfb, 0xdd, + 0x7a, 0x89, 0x8d, 0x16, 0x3a, 0x16, 0x71, 0x9d, 0xb7, 0x32}, + {0x6a, 0xb3, 0xfc, 0xed, 0xd9, 0xf8, 0x85, 0xcc, 0xf9, 0xe5, 0x46, + 0x37, 0x8f, 0xc2, 0xbc, 0x22, 0xcd, 0xd3, 0xe5, 0xf9, 0x38, 0xe3, + 0x9d, 0xe4, 0xcc, 0x2d, 0x3e, 0xc1, 0xfb, 0x5e, 0xa, 0x48}, + }, + { + {0x1f, 0x22, 0xce, 0x42, 0xe4, 0x4c, 0x61, 0xb6, 0x28, 0x39, 0x5, + 0x4c, 0xcc, 0x9d, 0x19, 0x6e, 0x3, 0xbe, 0x1c, 0xdc, 0xa4, 0xb4, + 0x3f, 0x66, 0x6, 0x8e, 0x1c, 0x69, 0x47, 0x1d, 0xb3, 0x24}, + {0x71, 0x20, 0x62, 0x1, 0xb, 0xe7, 
0x51, 0xb, 0xc5, 0xaf, 0x1d, + 0x8b, 0xcf, 0x5, 0xb5, 0x6, 0xcd, 0xab, 0x5a, 0xef, 0x61, 0xb0, + 0x6b, 0x2c, 0x31, 0xbf, 0xb7, 0xc, 0x60, 0x27, 0xaa, 0x47}, + {0xc3, 0xf8, 0x15, 0xc0, 0xed, 0x1e, 0x54, 0x2a, 0x7c, 0x3f, 0x69, + 0x7c, 0x7e, 0xfe, 0xa4, 0x11, 0xd6, 0x78, 0xa2, 0x4e, 0x13, 0x66, + 0xaf, 0xf0, 0x94, 0xa0, 0xdd, 0x14, 0x5d, 0x58, 0x5b, 0x54}, + }, + { + {0xe1, 0x21, 0xb3, 0xe3, 0xd0, 0xe4, 0x4, 0x62, 0x95, 0x1e, 0xff, + 0x28, 0x7a, 0x63, 0xaa, 0x3b, 0x9e, 0xbd, 0x99, 0x5b, 0xfd, 0xcf, + 0xc, 0xb, 0x71, 0xd0, 0xc8, 0x64, 0x3e, 0xdc, 0x22, 0x4d}, + {0xf, 0x3a, 0xd4, 0xa0, 0x5e, 0x27, 0xbf, 0x67, 0xbe, 0xee, 0x9b, + 0x8, 0x34, 0x8e, 0xe6, 0xad, 0x2e, 0xe7, 0x79, 0xd4, 0x4c, 0x13, + 0x89, 0x42, 0x54, 0x54, 0xba, 0x32, 0xc3, 0xf9, 0x62, 0xf}, + {0x39, 0x5f, 0x3b, 0xd6, 0x89, 0x65, 0xb4, 0xfc, 0x61, 0xcf, 0xcb, + 0x57, 0x3f, 0x6a, 0xae, 0x5c, 0x5, 0xfa, 0x3a, 0x95, 0xd2, 0xc2, + 0xba, 0xfe, 0x36, 0x14, 0x37, 0x36, 0x1a, 0xa0, 0xf, 0x1c}, + }, + }, + { + { + {0x50, 0x6a, 0x93, 0x8c, 0xe, 0x2b, 0x8, 0x69, 0xb6, 0xc5, 0xda, + 0xc1, 0x35, 0xa0, 0xc9, 0xf9, 0x34, 0xb6, 0xdf, 0xc4, 0x54, 0x3e, + 0xb7, 0x6f, 0x40, 0xc1, 0x2b, 0x1d, 0x9b, 0x41, 0x5, 0x40}, + {0xff, 0x3d, 0x94, 0x22, 0xb6, 0x4, 0xc6, 0xd2, 0xa0, 0xb3, 0xcf, + 0x44, 0xce, 0xbe, 0x8c, 0xbc, 0x78, 0x86, 0x80, 0x97, 0xf3, 0x4f, + 0x25, 0x5d, 0xbf, 0xa6, 0x1c, 0x3b, 0x4f, 0x61, 0xa3, 0xf}, + {0xf0, 0x82, 0xbe, 0xb9, 0xbd, 0xfe, 0x3, 0xa0, 0x90, 0xac, 0x44, + 0x3a, 0xaf, 0xc1, 0x89, 0x20, 0x8e, 0xfa, 0x54, 0x19, 0x91, 0x9f, + 0x49, 0xf8, 0x42, 0xab, 0x40, 0xef, 0x8a, 0x21, 0xba, 0x1f}, + }, + { + {0x94, 0x1, 0x7b, 0x3e, 0x4, 0x57, 0x3e, 0x4f, 0x7f, 0xaf, 0xda, + 0x8, 0xee, 0x3e, 0x1d, 0xa8, 0xf1, 0xde, 0xdc, 0x99, 0xab, 0xc6, + 0x39, 0xc8, 0xd5, 0x61, 0x77, 0xff, 0x13, 0x5d, 0x53, 0x6c}, + {0x3e, 0xf5, 0xc8, 0xfa, 0x48, 0x94, 0x54, 0xab, 0x41, 0x37, 0xa6, + 0x7b, 0x9a, 0xe8, 0xf6, 0x81, 0x1, 0x5e, 0x2b, 0x6c, 0x7d, 0x6c, + 0xfd, 0x74, 0x42, 0x6e, 0xc8, 0xa8, 0xca, 0x3a, 0x2e, 0x39}, + {0xaf, 0x35, 0x8a, 0x3e, 0xe9, 0x34, 0xbd, 0x4c, 0x16, 0xe8, 0x87, + 0x58, 0x44, 0x81, 0x7, 0x2e, 0xab, 0xb0, 0x9a, 0xf2, 0x76, 0x9c, + 0x31, 0x19, 0x3b, 0xc1, 0xa, 0xd5, 0xe4, 0x7f, 0xe1, 0x25}, + }, + { + {0xa7, 0x21, 0xf1, 0x76, 0xf5, 0x7f, 0x5f, 0x91, 0xe3, 0x87, 0xcd, + 0x2f, 0x27, 0x32, 0x4a, 0xc3, 0x26, 0xe5, 0x1b, 0x4d, 0xde, 0x2f, + 0xba, 0xcc, 0x9b, 0x89, 0x69, 0x89, 0x8f, 0x82, 0xba, 0x6b}, + {0x76, 0xf6, 0x4, 0x1e, 0xd7, 0x9b, 0x28, 0xa, 0x95, 0xf, 0x42, + 0xd6, 0x52, 0x1c, 0x8e, 0x20, 0xab, 0x1f, 0x69, 0x34, 0xb0, 0xd8, + 0x86, 0x51, 0x51, 0xb3, 0x9f, 0x2a, 0x44, 0x51, 0x57, 0x25}, + {0x1, 0x39, 0xfe, 0x90, 0x66, 0xbc, 0xd1, 0xe2, 0xd5, 0x7a, 0x99, + 0xa0, 0x18, 0x4a, 0xb5, 0x4c, 0xd4, 0x60, 0x84, 0xaf, 0x14, 0x69, + 0x1d, 0x97, 0xe4, 0x7b, 0x6b, 0x7f, 0x4f, 0x50, 0x9d, 0x55}, + }, + { + {0xfd, 0x66, 0xd2, 0xf6, 0xe7, 0x91, 0x48, 0x9c, 0x1b, 0x78, 0x7, + 0x3, 0x9b, 0xa1, 0x44, 0x7, 0x3b, 0xe2, 0x61, 0x60, 0x1d, 0x8f, + 0x38, 0x88, 0xe, 0xd5, 0x4b, 0x35, 0xa3, 0xa6, 0x3e, 0x12}, + {0xd5, 0x54, 0xeb, 0xb3, 0x78, 0x83, 0x73, 0xa7, 0x7c, 0x3c, 0x55, + 0xa5, 0x66, 0xd3, 0x69, 0x1d, 0xba, 0x0, 0x28, 0xf9, 0x62, 0xcf, + 0x26, 0xa, 0x17, 0x32, 0x7e, 0x80, 0xd5, 0x12, 0xab, 0x1}, + {0x96, 0x2d, 0xe3, 0x41, 0x90, 0x18, 0x8d, 0x11, 0x48, 0x58, 0x31, + 0xd8, 0xc2, 0xe3, 0xed, 0xb9, 0xd9, 0x45, 0x32, 0xd8, 0x71, 0x42, + 0xab, 0x1e, 0x54, 0xa1, 0x18, 0xc9, 0xe2, 0x61, 0x39, 0x4a}, + }, + { + {0x1e, 0x3f, 0x23, 0xf3, 0x44, 0xd6, 0x27, 0x3, 0x16, 0xf0, 0xfc, + 0x34, 0xe, 0x26, 0x9a, 0x49, 0x79, 0xb9, 0xda, 0xf2, 0x16, 0xa7, + 0xb5, 0x83, 0x1f, 0x11, 
0xd4, 0x9b, 0xad, 0xee, 0xac, 0x68}, + {0xa0, 0xbb, 0xe6, 0xf8, 0xe0, 0x3b, 0xdc, 0x71, 0xa, 0xe3, 0xff, + 0x7e, 0x34, 0xf8, 0xce, 0xd6, 0x6a, 0x47, 0x3a, 0xe1, 0x5f, 0x42, + 0x92, 0xa9, 0x63, 0xb7, 0x1d, 0xfb, 0xe3, 0xbc, 0xd6, 0x2c}, + {0x10, 0xc2, 0xd7, 0xf3, 0xe, 0xc9, 0xb4, 0x38, 0xc, 0x4, 0xad, + 0xb7, 0x24, 0x6e, 0x8e, 0x30, 0x23, 0x3e, 0xe7, 0xb7, 0xf1, 0xd9, + 0x60, 0x38, 0x97, 0xf5, 0x8, 0xb5, 0xd5, 0x60, 0x57, 0x59}, + }, + { + {0x90, 0x27, 0x2, 0xfd, 0xeb, 0xcb, 0x2a, 0x88, 0x60, 0x57, 0x11, + 0xc4, 0x5, 0x33, 0xaf, 0x89, 0xf4, 0x73, 0x34, 0x7d, 0xe3, 0x92, + 0xf4, 0x65, 0x2b, 0x5a, 0x51, 0x54, 0xdf, 0xc5, 0xb2, 0x2c}, + {0x97, 0x63, 0xaa, 0x4, 0xe1, 0xbf, 0x29, 0x61, 0xcb, 0xfc, 0xa7, + 0xa4, 0x8, 0x0, 0x96, 0x8f, 0x58, 0x94, 0x90, 0x7d, 0x89, 0xc0, + 0x8b, 0x3f, 0xa9, 0x91, 0xb2, 0xdc, 0x3e, 0xa4, 0x9f, 0x70}, + {0xca, 0x2a, 0xfd, 0x63, 0x8c, 0x5d, 0xa, 0xeb, 0xff, 0x4e, 0x69, + 0x2e, 0x66, 0xc1, 0x2b, 0xd2, 0x3a, 0xb0, 0xcb, 0xf8, 0x6e, 0xf3, + 0x23, 0x27, 0x1f, 0x13, 0xc8, 0xf0, 0xec, 0x29, 0xf0, 0x70}, + }, + { + {0xb9, 0xb0, 0x10, 0x5e, 0xaa, 0xaf, 0x6a, 0x2a, 0xa9, 0x1a, 0x4, + 0xef, 0x70, 0xa3, 0xf0, 0x78, 0x1f, 0xd6, 0x3a, 0xaa, 0x77, 0xfb, + 0x3e, 0x77, 0xe1, 0xd9, 0x4b, 0xa7, 0xa2, 0xa5, 0xec, 0x44}, + {0x33, 0x3e, 0xed, 0x2e, 0xb3, 0x7, 0x13, 0x46, 0xe7, 0x81, 0x55, + 0xa4, 0x33, 0x2f, 0x4, 0xae, 0x66, 0x3, 0x5f, 0x19, 0xd3, 0x49, + 0x44, 0xc9, 0x58, 0x48, 0x31, 0x6c, 0x8a, 0x5d, 0x7d, 0xb}, + {0x43, 0xd5, 0x95, 0x7b, 0x32, 0x48, 0xd4, 0x25, 0x1d, 0xf, 0x34, + 0xa3, 0x0, 0x83, 0xd3, 0x70, 0x2b, 0xc5, 0xe1, 0x60, 0x1c, 0x53, + 0x1c, 0xde, 0xe4, 0xe9, 0x7d, 0x2c, 0x51, 0x24, 0x22, 0x27}, + }, + { + {0xfc, 0x75, 0xa9, 0x42, 0x8a, 0xbb, 0x7b, 0xbf, 0x58, 0xa3, 0xad, + 0x96, 0x77, 0x39, 0x5c, 0x8c, 0x48, 0xaa, 0xed, 0xcd, 0x6f, 0xc7, + 0x7f, 0xe2, 0xa6, 0x20, 0xbc, 0xf6, 0xd7, 0x5f, 0x73, 0x19}, + {0x2e, 0x34, 0xc5, 0x49, 0xaf, 0x92, 0xbc, 0x1a, 0xd0, 0xfa, 0xe6, + 0xb2, 0x11, 0xd8, 0xee, 0xff, 0x29, 0x4e, 0xc8, 0xfc, 0x8d, 0x8c, + 0xa2, 0xef, 0x43, 0xc5, 0x4c, 0xa4, 0x18, 0xdf, 0xb5, 0x11}, + {0x66, 0x42, 0xc8, 0x42, 0xd0, 0x90, 0xab, 0xe3, 0x7e, 0x54, 0x19, + 0x7f, 0xf, 0x8e, 0x84, 0xeb, 0xb9, 0x97, 0xa4, 0x65, 0xd0, 0xa1, + 0x3, 0x25, 0x5f, 0x89, 0xdf, 0x91, 0x11, 0x91, 0xef, 0xf}, + }, + }, +}; + +#endif // OPENSSL_SMALL + +// Bi[i] = (2*i+1)*B +static const ge_precomp Bi[8] = { + { + {{ +#if defined(OPENSSL_64_BIT) + 1288382639258501, 245678601348599, 269427782077623, + 1462984067271730, 137412439391563 +#else + 25967493, 19198397, 29566455, 3660896, 54414519, 4014786, 27544626, + 21800161, 61029707, 2047604 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 62697248952638, 204681361388450, 631292143396476, 338455783676468, + 1213667448819585 +#else + 54563134, 934261, 64385954, 3049989, 66381436, 9406985, 12720692, + 5043384, 19500929, 18085054 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 301289933810280, 1259582250014073, 1422107436869536, + 796239922652654, 1953934009299142 +#else + 58370664, 4489569, 9688441, 18769238, 10184608, 21191052, 29287918, + 11864899, 42594502, 29115885 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 1601611775252272, 1720807796594148, 1132070835939856, + 1260455018889551, 2147779492816911 +#else + 15636272, 23865875, 24204772, 25642034, 616976, 16869170, 27787599, + 18782243, 28944399, 32004408 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 316559037616741, 2177824224946892, 1459442586438991, + 1461528397712656, 751590696113597 +#else + 16568933, 4717097, 55552716, 32452109, 15682895, 21747389, 16354576, + 21778470, 
7689661, 11199574 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1850748884277385, 1200145853858453, 1068094770532492, + 672251375690438, 1586055907191707 +#else + 30464137, 27578307, 55329429, 17883566, 23220364, 15915852, 7512774, + 10017326, 49359771, 23634074 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 769950342298419, 132954430919746, 844085933195555, 974092374476333, + 726076285546016 +#else + 10861363, 11473154, 27284546, 1981175, 37044515, 12577860, 32867885, + 14515107, 51670560, 10819379 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 425251763115706, 608463272472562, 442562545713235, 837766094556764, + 374555092627893 +#else + 4708026, 6336745, 20377586, 9066809, 55836755, 6594695, 41455196, + 12483687, 54440373, 5581305 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1086255230780037, 274979815921559, 1960002765731872, + 929474102396301, 1190409889297339 +#else + 19563141, 16186464, 37722007, 4097518, 10237984, 29206317, 28542349, + 13850243, 43430843, 17738489 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 665000864555967, 2065379846933859, 370231110385876, 350988370788628, + 1233371373142985 +#else + 5153727, 9909285, 1723747, 30776558, 30523604, 5516873, 19480852, + 5230134, 43156425, 18378665 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 2019367628972465, 676711900706637, 110710997811333, + 1108646842542025, 517791959672113 +#else + 36839857, 30090922, 7665485, 10083793, 28475525, 1649722, 20654025, + 16520125, 30598449, 7715701 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 965130719900578, 247011430587952, 526356006571389, 91986625355052, + 2157223321444601 +#else + 28881826, 14381568, 9657904, 3680757, 46927229, 7843315, 35708204, + 1370707, 29794553, 32145132 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 1802695059465007, 1664899123557221, 593559490740857, + 2160434469266659, 927570450755031 +#else + 44589871, 26862249, 14201701, 24808930, 43598457, 8844725, 18474211, + 32192982, 54046167, 13821876 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1725674970513508, 1933645953859181, 1542344539275782, + 1767788773573747, 1297447965928905 +#else + 60653668, 25714560, 3374701, 28813570, 40010246, 22982724, 31655027, + 26342105, 18853321, 19333481 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1381809363726107, 1430341051343062, 2061843536018959, + 1551778050872521, 2036394857967624 +#else + 4566811, 20590564, 38133974, 21313742, 59506191, 30723862, 58594505, + 23123294, 2207752, 30344648 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 1970894096313054, 528066325833207, 1619374932191227, + 2207306624415883, 1169170329061080 +#else + 41954014, 29368610, 29681143, 7868801, 60254203, 24130566, 54671499, + 32891431, 35997400, 17421995 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 2070390218572616, 1458919061857835, 624171843017421, + 1055332792707765, 433987520732508 +#else + 25576264, 30851218, 7349803, 21739588, 16472781, 9300885, 3844789, + 15725684, 171356, 6466918 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 893653801273833, 1168026499324677, 1242553501121234, + 1306366254304474, 1086752658510815 +#else + 23103977, 13316479, 9739013, 17404951, 817874, 18515490, 8965338, + 19466374, 36393951, 16193876 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 213454002618221, 939771523987438, 1159882208056014, 317388369627517, + 621213314200687 +#else + 33587053, 3180712, 64714734, 14003686, 50205390, 17283591, 17238397, + 4729455, 49034351, 9256799 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 
1971678598905747, 338026507889165, 762398079972271, 655096486107477, + 42299032696322 +#else + 41926547, 29380300, 32336397, 5036987, 45872047, 11360616, 22616405, + 9761698, 47281666, 630304 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 177130678690680, 1754759263300204, 1864311296286618, + 1180675631479880, 1292726903152791 +#else + 53388152, 2639452, 42871404, 26147950, 9494426, 27780403, 60554312, + 17593437, 64659607, 19263131 +#endif + }}, + }, + { + {{ +#if defined(OPENSSL_64_BIT) + 1913163449625248, 460779200291993, 2193883288642314, + 1008900146920800, 1721983679009502 +#else + 63957664, 28508356, 9282713, 6866145, 35201802, 32691408, 48168288, + 15033783, 25105118, 25659556 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 1070401523076875, 1272492007800961, 1910153608563310, + 2075579521696771, 1191169788841221 +#else + 42782475, 15950225, 35307649, 18961608, 55446126, 28463506, 1573891, + 30928545, 2198789, 17749813 +#endif + }}, + {{ +#if defined(OPENSSL_64_BIT) + 692896803108118, 500174642072499, 2068223309439677, + 1162190621851337, 1426986007309901 +#else + 64009494, 10324966, 64867251, 7453182, 61661885, 30818928, 53296841, + 17317989, 34647629, 21263748 +#endif + }}, + }, +}; diff --git a/ring-0.17.14/crypto/curve25519/internal.h b/ring-0.17.14/crypto/curve25519/internal.h new file mode 100644 index 0000000000..d15a1b47a7 --- /dev/null +++ b/ring-0.17.14/crypto/curve25519/internal.h @@ -0,0 +1,135 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CURVE25519_INTERNAL_H +#define OPENSSL_HEADER_CURVE25519_INTERNAL_H + +#include + +#include "../internal.h" + + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_APPLE) +#define BORINGSSL_X25519_NEON + +// x25519_NEON is defined in asm/x25519-arm.S. +void x25519_NEON(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]); +#endif + +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \ + defined(__GNUC__) && defined(__x86_64__) && !defined(OPENSSL_WINDOWS) +#define BORINGSSL_FE25519_ADX + +// fiat_curve25519_adx_mul is defined in +// third_party/fiat/asm/fiat_curve25519_adx_mul.S +void __attribute__((sysv_abi)) +fiat_curve25519_adx_mul(uint64_t out[4], const uint64_t in1[4], + const uint64_t in2[4]); + +// fiat_curve25519_adx_square is defined in +// third_party/fiat/asm/fiat_curve25519_adx_square.S +void __attribute__((sysv_abi)) +fiat_curve25519_adx_square(uint64_t out[4], const uint64_t in[4]); + +// x25519_scalar_mult_adx is defined in third_party/fiat/curve25519_64_adx.h +void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]); +void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]); +#endif + +#if defined(OPENSSL_64_BIT) +// An element t, +// entries t[0]...t[4], represents the integer t[0]+2^51 t[1]+2^102 t[2]+2^153 +// t[3]+2^204 t[4]. +// fe limbs are bounded by 1.125*2^51. +// fe_loose limbs are bounded by 3.375*2^51. 
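+//
+// (Editor's illustrative note, not part of the upstream comment: in this
+// radix-2^51 representation the integer 1 is stored as t = {1, 0, 0, 0, 0}
+// and 2^51 + 7 as t = {7, 1, 0, 0, 0}. Adding two in-bounds fe values
+// limb-by-limb stays below the fe_loose bound above, which is why addition
+// and subtraction can return fe_loose without an immediate carry.)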
+typedef uint64_t fe_limb_t; +#define FE_NUM_LIMBS 5 +#else +// An element t, +// entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 +// t[3]+2^102 t[4]+...+2^230 t[9]. +// fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc. +// fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc. +typedef uint32_t fe_limb_t; +#define FE_NUM_LIMBS 10 +#endif + +// fe means field element. Here the field is \Z/(2^255-19). +// Multiplication and carrying produce fe from fe_loose. +// Keep in sync with `Elem` and `ELEM_LIMBS` in curve25519/ops.rs. +typedef struct fe { fe_limb_t v[FE_NUM_LIMBS]; } fe; + +// Addition and subtraction produce fe_loose from (fe, fe). +// Keep in sync with `Elem` and `ELEM_LIMBS` in curve25519/ops.rs. +typedef struct fe_loose { fe_limb_t v[FE_NUM_LIMBS]; } fe_loose; + +static inline void fe_limbs_copy(fe_limb_t r[], const fe_limb_t a[]) { + for (size_t i = 0; i < FE_NUM_LIMBS; ++i) { + r[i] = a[i]; + } +} + +// ge means group element. +// +// Here the group is the set of pairs (x,y) of field elements (see fe.h) +// satisfying -x^2 + y^2 = 1 + d x^2y^2 +// where d = -121665/121666. +// +// Representations: +// ge_p2 (projective): (X:Y:Z) satisfying x=X/Z, y=Y/Z +// ge_p3 (extended): (X:Y:Z:T) satisfying x=X/Z, y=Y/Z, XY=ZT +// ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T +// ge_precomp (Duif): (y+x,y-x,2dxy) + +// Keep in sync with `Point` in curve25519/ops.rs. +typedef struct { + fe X; + fe Y; + fe Z; +} ge_p2; + + +// Keep in sync with `ExtPoint` in curve25519/ops.rs. +typedef struct { + fe X; + fe Y; + fe Z; + fe T; +} ge_p3; + +typedef struct { + fe_loose X; + fe_loose Y; + fe_loose Z; + fe_loose T; +} ge_p1p1; + +typedef struct { + fe_loose yplusx; + fe_loose yminusx; + fe_loose xy2d; +} ge_precomp; + +typedef struct { + fe_loose YplusX; + fe_loose YminusX; + fe_loose Z; + fe_loose T2d; +} ge_cached; + +extern const uint8_t k25519Precomp[32][8][3][32]; + +#endif // OPENSSL_HEADER_CURVE25519_INTERNAL_H diff --git a/ring-0.17.14/crypto/fipsmodule/aes/aes_nohw.c b/ring-0.17.14/crypto/fipsmodule/aes/aes_nohw.c new file mode 100644 index 0000000000..9530cbc9b2 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/aes_nohw.c @@ -0,0 +1,881 @@ +/* Copyright (c) 2019, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include + +#include "../../internal.h" + +// This file contains a constant-time implementation of AES, bitsliced with +// 32-bit or 64-bit, operating on two-, four-, and eight-block +// batches, respectively. +// +// This implementation is based on the algorithms described in the following +// references: +// - https://bearssl.org/constanttime.html#aes +// - https://eprint.iacr.org/2009/129.pdf +// - https://eprint.iacr.org/2009/191.pdf + + +// Word operations. 
+// +// An aes_word_t is the word used for this AES implementation. Throughout this +// file, bits and bytes are ordered little-endian, though "left" and "right" +// shifts match the operations themselves, which makes them reversed in a +// little-endian, left-to-right reading. +// +// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an +// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| +// bits each, each corresponding to a byte in an AES block in column-major +// order (AES's byte order). We refer to these as "logical bytes". Note, in the +// 32-bit and 64-bit implementations, they are smaller than a byte. (The +// contents of a logical byte will be described later.) +// +// MSVC does not support C bit operators on |__m128i|, so the wrapper functions +// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and +// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift +// value ranges from 0 to 15 independent of |aes_word_t| and +// |AES_NOHW_BATCH_SIZE|. +// +// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which +// uses row-major order. Matching the AES order was easier to reason about, and +// we do not have PSHUFB available to arbitrarily permute bytes. + +#if defined(OPENSSL_64_BIT) +typedef uint64_t aes_word_t; +#define AES_NOHW_WORD_SIZE 8 +#define AES_NOHW_BATCH_SIZE 4 +#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) +#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) +#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) +#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) +#else // !OPENSSL_64_BIT +typedef uint32_t aes_word_t; +#define AES_NOHW_WORD_SIZE 4 +#define AES_NOHW_BATCH_SIZE 2 +#define AES_NOHW_ROW0_MASK 0x03030303 +#define AES_NOHW_ROW1_MASK 0x0c0c0c0c +#define AES_NOHW_ROW2_MASK 0x30303030 +#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 +#endif // OPENSSL_64_BIT + +static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { + return a & b; +} + +static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { + return a | b; +} + +static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { + return a ^ b; +} + +static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } + +static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { + return a << (i * AES_NOHW_BATCH_SIZE); +} + +static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { + return a >> (i * AES_NOHW_BATCH_SIZE); +} + +OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), + "batch size does not match word size"); +OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), + "AES_NOHW_WORD_SIZE is incorrect"); + + +// Block representations. +// +// This implementation uses three representations for AES blocks. First, the +// public API represents blocks as uint8_t[16] in the usual way. Second, most +// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. +// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words +// containing bitsliced blocks a, b, c, d, this would be as follows (vertical +// bars divide logical bytes): +// +// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... +// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... +// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... +// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... +// ... 
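+//
+// (Editor's illustrative note, not part of the upstream comment: in the
+// 32-bit build AES_NOHW_BATCH_SIZE is 2, so each logical byte is two bits
+// wide and a batch holds two blocks; the layout sketched above is otherwise
+// the same, with pairs a/b in place of the quadruples a/b/c/d.)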
+// +// Finally, an individual block may be stored as an intermediate form in an +// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each +// block, so that block[0]'s ith logical byte contains least-significant +// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of +// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as +// "compacting" the block. Note this is no-op with 128-bit words because then +// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit +// words, one block would be stored in two words: +// +// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... +// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... +// +// Observe that the distances between corresponding bits in bitsliced and +// compact bit orders match. If we line up corresponding words of each block, +// the bitsliced and compact representations may be converted by tranposing bits +// in corresponding logical bytes. Continuing the 64-bit example: +// +// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... +// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... +// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... +// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... +// +// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... +// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... +// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... +// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... +// +// Note also that bitwise operations and (logical) byte permutations on an +// |aes_word_t| work equally for the bitsliced and compact words. +// +// We use the compact form in the |AES_KEY| representation to save work +// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists +// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately +// before or after |aes_nohw_transpose|. + +#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) + +// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise +// specified, it is in bitsliced form. +typedef struct { + aes_word_t w[8]; +} AES_NOHW_BATCH; + +// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is +// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| +// |AES_KEY|s so it should not be used as a long-term key representation. +typedef struct { + // keys is an array of batches, one for each round key. Each batch stores + // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. + AES_NOHW_BATCH keys[AES_MAXNR + 1]; +} AES_NOHW_SCHEDULE; + +// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in +// compact form. +static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, + const aes_word_t in[AES_NOHW_BLOCK_WORDS], + size_t i) { + // Note the words are interleaved. The order comes from |aes_nohw_transpose|. + // If |i| is zero and this is the 64-bit implementation, in[0] contains bits + // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at + // w[4] so that bits 0 and 4 are in the correct position. (In general, bits + // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares + // will be correctly placed.) 
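+ // (Editor's illustrative note: concretely, in the 64-bit build i = 1 places
+ // in[0] at w[1] and in[1] at w[5], which is exactly the interleaving that
+ // |aes_nohw_batch_get| below reads back out.)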
+  dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
+#if defined(OPENSSL_64_BIT)
+  batch->w[i] = in[0];
+  batch->w[i + 4] = in[1];
+#else
+  batch->w[i] = in[0];
+  batch->w[i + 2] = in[1];
+  batch->w[i + 4] = in[2];
+  batch->w[i + 6] = in[3];
+#endif
+}
+
+// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
+// compact form.
+static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
+                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
+                                      size_t i) {
+  dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
+#if defined(OPENSSL_64_BIT)
+  out[0] = batch->w[i];
+  out[1] = batch->w[i + 4];
+#else
+  out[0] = batch->w[i];
+  out[1] = batch->w[i + 2];
+  out[2] = batch->w[i + 4];
+  out[3] = batch->w[i + 6];
+#endif
+}
+
+// aes_nohw_delta_swap returns |a| with bits |a & mask| and
+// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
+static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
+                                             aes_word_t shift) {
+  // See
+  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
+  aes_word_t b = (a ^ (a >> shift)) & mask;
+  return a ^ b ^ (b << shift);
+}
+
+// In the 32-bit and 64-bit implementations, a block spans multiple words.
+// |aes_nohw_compact_block| must permute bits across different words. First we
+// implement |aes_nohw_compact_word| which performs a smaller version of the
+// transformation which stays within a single word.
+//
+// These transformations are generalizations of the output of
+// http://programming.sirrida.de/calcperm.php on smaller inputs.
+#if defined(OPENSSL_64_BIT)
+static inline uint64_t aes_nohw_compact_word(uint64_t a) {
+#if defined(RING_BIG_ENDIAN)
+  a = CRYPTO_bswap8(a);
+#endif
+  // Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
+  // quartets of those chunks:
+  //   0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
+  //   0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
+  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
+  //   0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
+  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
+  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
+  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
+  //   0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
+  return a;
+}
+
+static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
+  // Reverse the steps of |aes_nohw_compact_word|.
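+  // Each delta swap is its own inverse, so uncompacting simply applies the
+  // same three swaps in the opposite order.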
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
+  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
+  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
+#if defined(RING_BIG_ENDIAN)
+  a = CRYPTO_bswap8(a);
+#endif
+  return a;
+}
+#else  // !OPENSSL_64_BIT
+static inline uint32_t aes_nohw_compact_word(uint32_t a) {
+#if defined(RING_BIG_ENDIAN)
+  a = CRYPTO_bswap4(a);
+#endif
+  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
+  //   0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
+  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
+  // Note:  0x00cc = 0b0000_0000_1100_1100
+  //   0x00cc << 6 = 0b0011_0011_0000_0000
+  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
+  // Now we swap groups of four bits (still numbering by pairs):
+  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
+  //   0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
+  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
+  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
+  return a;
+}
+
+static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
+  // Reverse the steps of |aes_nohw_compact_word|.
+  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
+  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
+#if defined(RING_BIG_ENDIAN)
+  a = CRYPTO_bswap4(a);
+#endif
+  return a;
+}
+
+static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
+                                                uint8_t a2, uint8_t a3) {
+  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
+         ((uint32_t)a3 << 24);
+}
+
+static inline uint8_t lo(uint32_t a) {
+  return (uint8_t)a;
+}
+
+#endif  // OPENSSL_64_BIT
+
+static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
+                                          const uint8_t in[16]) {
+  OPENSSL_memcpy(out, in, 16);
+#if defined(OPENSSL_64_BIT)
+  uint64_t a0 = aes_nohw_compact_word(out[0]);
+  uint64_t a1 = aes_nohw_compact_word(out[1]);
+  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
+  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
+#else
+  uint32_t a0 = aes_nohw_compact_word(out[0]);
+  uint32_t a1 = aes_nohw_compact_word(out[1]);
+  uint32_t a2 = aes_nohw_compact_word(out[2]);
+  uint32_t a3 = aes_nohw_compact_word(out[3]);
+  // Note clang, when building for ARM Thumb2, will sometimes miscompile
+  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
+  // without optimizations. This bug was introduced in
+  // https://reviews.llvm.org/rL340261 and fixed in
+  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
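+  // Gather byte k of a0..a3 into out[k]: each output word collects the same
+  // byte position from all four compacted words.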
+ out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); + out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); + out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); + out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); +#endif +} + +static inline void aes_nohw_uncompact_block( + uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { +#if defined(OPENSSL_64_BIT) + uint64_t a0 = in[0]; + uint64_t a1 = in[1]; + uint64_t b0 = + aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); + uint64_t b1 = + aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); + OPENSSL_memcpy(out, &b0, 8); + OPENSSL_memcpy(out + 8, &b1, 8); +#else + uint32_t a0 = in[0]; + uint32_t a1 = in[1]; + uint32_t a2 = in[2]; + uint32_t a3 = in[3]; + // Note clang, when building for ARM Thumb2, will sometimes miscompile + // expressions such as (a0 & 0x0000ff00) << 8, particularly when building + // without optimizations. This bug was introduced in + // https://reviews.llvm.org/rL340261 and fixed in + // https://reviews.llvm.org/rL351310. The following is written to avoid this. + uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); + uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); + uint32_t b2 = + aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); + uint32_t b3 = + aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); + b0 = aes_nohw_uncompact_word(b0); + b1 = aes_nohw_uncompact_word(b1); + b2 = aes_nohw_uncompact_word(b2); + b3 = aes_nohw_uncompact_word(b3); + OPENSSL_memcpy(out, &b0, 4); + OPENSSL_memcpy(out + 4, &b1, 4); + OPENSSL_memcpy(out + 8, &b2, 4); + OPENSSL_memcpy(out + 12, &b3, 4); +#endif +} + +// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in +// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and +// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it +// is repeated to the full width of |aes_word_t|. +static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, + uint32_t mask, aes_word_t shift) { +#if defined(OPENSSL_64_BIT) + aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; +#else + aes_word_t mask_w = mask; +#endif + // This is a variation on a delta swap. + aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; + *a ^= swap << shift; + *b ^= swap; +} + +// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides +// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares +// and transposes each square. +static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { + // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). + aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); + +#if AES_NOHW_BATCH_SIZE >= 4 + // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). 
+  aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
+  aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
+#endif
+
+#if AES_NOHW_BATCH_SIZE >= 8
+  // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
+  aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
+  aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
+#endif
+}
+
+// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
+// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
+static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
+                              size_t num_blocks) {
+  // Don't leave unused blocks uninitialized.
+  OPENSSL_memset(out, 0, sizeof(AES_NOHW_BATCH));
+  debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
+  for (size_t i = 0; i < num_blocks; i++) {
+    aes_word_t block[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_compact_block(block, in + 16 * i);
+    aes_nohw_batch_set(out, block, i);
+  }
+
+  aes_nohw_transpose(out);
+}
+
+// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
+// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
+static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
+                                const AES_NOHW_BATCH *batch) {
+  AES_NOHW_BATCH copy = *batch;
+  aes_nohw_transpose(&copy);
+
+  debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
+  for (size_t i = 0; i < num_blocks; i++) {
+    aes_word_t block[AES_NOHW_BLOCK_WORDS];
+    aes_nohw_batch_get(&copy, block, i);
+    aes_nohw_uncompact_block(out + 16 * i, block);
+  }
+}
+
+
+// AES round steps.
+
+static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
+                                   const AES_NOHW_BATCH *key) {
+  for (size_t i = 0; i < 8; i++) {
+    batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
+  }
+}
+
+static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
+  // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
+  aes_word_t x0 = batch->w[7];
+  aes_word_t x1 = batch->w[6];
+  aes_word_t x2 = batch->w[5];
+  aes_word_t x3 = batch->w[4];
+  aes_word_t x4 = batch->w[3];
+  aes_word_t x5 = batch->w[2];
+  aes_word_t x6 = batch->w[1];
+  aes_word_t x7 = batch->w[0];
+
+  // Figure 2, the top linear transformation.
+  aes_word_t y14 = aes_nohw_xor(x3, x5);
+  aes_word_t y13 = aes_nohw_xor(x0, x6);
+  aes_word_t y9 = aes_nohw_xor(x0, x3);
+  aes_word_t y8 = aes_nohw_xor(x0, x5);
+  aes_word_t t0 = aes_nohw_xor(x1, x2);
+  aes_word_t y1 = aes_nohw_xor(t0, x7);
+  aes_word_t y4 = aes_nohw_xor(y1, x3);
+  aes_word_t y12 = aes_nohw_xor(y13, y14);
+  aes_word_t y2 = aes_nohw_xor(y1, x0);
+  aes_word_t y5 = aes_nohw_xor(y1, x6);
+  aes_word_t y3 = aes_nohw_xor(y5, y8);
+  aes_word_t t1 = aes_nohw_xor(x4, y12);
+  aes_word_t y15 = aes_nohw_xor(t1, x5);
+  aes_word_t y20 = aes_nohw_xor(t1, x1);
+  aes_word_t y6 = aes_nohw_xor(y15, x7);
+  aes_word_t y10 = aes_nohw_xor(y15, t0);
+  aes_word_t y11 = aes_nohw_xor(y20, y9);
+  aes_word_t y7 = aes_nohw_xor(x7, y11);
+  aes_word_t y17 = aes_nohw_xor(y10, y11);
+  aes_word_t y19 = aes_nohw_xor(y10, y8);
+  aes_word_t y16 = aes_nohw_xor(t0, y11);
+  aes_word_t y21 = aes_nohw_xor(y13, y16);
+  aes_word_t y18 = aes_nohw_xor(x0, y16);
+
+  // Figure 3, the middle non-linear section.
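+  // All of the circuit's AND gates are in this section; the transformations
+  // before and after it use only XOR (plus a few NOTs at the very end).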
+ aes_word_t t2 = aes_nohw_and(y12, y15); + aes_word_t t3 = aes_nohw_and(y3, y6); + aes_word_t t4 = aes_nohw_xor(t3, t2); + aes_word_t t5 = aes_nohw_and(y4, x7); + aes_word_t t6 = aes_nohw_xor(t5, t2); + aes_word_t t7 = aes_nohw_and(y13, y16); + aes_word_t t8 = aes_nohw_and(y5, y1); + aes_word_t t9 = aes_nohw_xor(t8, t7); + aes_word_t t10 = aes_nohw_and(y2, y7); + aes_word_t t11 = aes_nohw_xor(t10, t7); + aes_word_t t12 = aes_nohw_and(y9, y11); + aes_word_t t13 = aes_nohw_and(y14, y17); + aes_word_t t14 = aes_nohw_xor(t13, t12); + aes_word_t t15 = aes_nohw_and(y8, y10); + aes_word_t t16 = aes_nohw_xor(t15, t12); + aes_word_t t17 = aes_nohw_xor(t4, t14); + aes_word_t t18 = aes_nohw_xor(t6, t16); + aes_word_t t19 = aes_nohw_xor(t9, t14); + aes_word_t t20 = aes_nohw_xor(t11, t16); + aes_word_t t21 = aes_nohw_xor(t17, y20); + aes_word_t t22 = aes_nohw_xor(t18, y19); + aes_word_t t23 = aes_nohw_xor(t19, y21); + aes_word_t t24 = aes_nohw_xor(t20, y18); + aes_word_t t25 = aes_nohw_xor(t21, t22); + aes_word_t t26 = aes_nohw_and(t21, t23); + aes_word_t t27 = aes_nohw_xor(t24, t26); + aes_word_t t28 = aes_nohw_and(t25, t27); + aes_word_t t29 = aes_nohw_xor(t28, t22); + aes_word_t t30 = aes_nohw_xor(t23, t24); + aes_word_t t31 = aes_nohw_xor(t22, t26); + aes_word_t t32 = aes_nohw_and(t31, t30); + aes_word_t t33 = aes_nohw_xor(t32, t24); + aes_word_t t34 = aes_nohw_xor(t23, t33); + aes_word_t t35 = aes_nohw_xor(t27, t33); + aes_word_t t36 = aes_nohw_and(t24, t35); + aes_word_t t37 = aes_nohw_xor(t36, t34); + aes_word_t t38 = aes_nohw_xor(t27, t36); + aes_word_t t39 = aes_nohw_and(t29, t38); + aes_word_t t40 = aes_nohw_xor(t25, t39); + aes_word_t t41 = aes_nohw_xor(t40, t37); + aes_word_t t42 = aes_nohw_xor(t29, t33); + aes_word_t t43 = aes_nohw_xor(t29, t40); + aes_word_t t44 = aes_nohw_xor(t33, t37); + aes_word_t t45 = aes_nohw_xor(t42, t41); + aes_word_t z0 = aes_nohw_and(t44, y15); + aes_word_t z1 = aes_nohw_and(t37, y6); + aes_word_t z2 = aes_nohw_and(t33, x7); + aes_word_t z3 = aes_nohw_and(t43, y16); + aes_word_t z4 = aes_nohw_and(t40, y1); + aes_word_t z5 = aes_nohw_and(t29, y7); + aes_word_t z6 = aes_nohw_and(t42, y11); + aes_word_t z7 = aes_nohw_and(t45, y17); + aes_word_t z8 = aes_nohw_and(t41, y10); + aes_word_t z9 = aes_nohw_and(t44, y12); + aes_word_t z10 = aes_nohw_and(t37, y3); + aes_word_t z11 = aes_nohw_and(t33, y4); + aes_word_t z12 = aes_nohw_and(t43, y13); + aes_word_t z13 = aes_nohw_and(t40, y5); + aes_word_t z14 = aes_nohw_and(t29, y2); + aes_word_t z15 = aes_nohw_and(t42, y9); + aes_word_t z16 = aes_nohw_and(t45, y14); + aes_word_t z17 = aes_nohw_and(t41, y8); + + // Figure 4, bottom linear transformation. 
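+  // Combine the shared products z0..z17 into the eight S-box output bits. The
+  // |aes_nohw_not|s account for the affine constant in the AES S-box.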
+ aes_word_t t46 = aes_nohw_xor(z15, z16); + aes_word_t t47 = aes_nohw_xor(z10, z11); + aes_word_t t48 = aes_nohw_xor(z5, z13); + aes_word_t t49 = aes_nohw_xor(z9, z10); + aes_word_t t50 = aes_nohw_xor(z2, z12); + aes_word_t t51 = aes_nohw_xor(z2, z5); + aes_word_t t52 = aes_nohw_xor(z7, z8); + aes_word_t t53 = aes_nohw_xor(z0, z3); + aes_word_t t54 = aes_nohw_xor(z6, z7); + aes_word_t t55 = aes_nohw_xor(z16, z17); + aes_word_t t56 = aes_nohw_xor(z12, t48); + aes_word_t t57 = aes_nohw_xor(t50, t53); + aes_word_t t58 = aes_nohw_xor(z4, t46); + aes_word_t t59 = aes_nohw_xor(z3, t54); + aes_word_t t60 = aes_nohw_xor(t46, t57); + aes_word_t t61 = aes_nohw_xor(z14, t57); + aes_word_t t62 = aes_nohw_xor(t52, t58); + aes_word_t t63 = aes_nohw_xor(t49, t58); + aes_word_t t64 = aes_nohw_xor(z4, t59); + aes_word_t t65 = aes_nohw_xor(t61, t62); + aes_word_t t66 = aes_nohw_xor(z1, t63); + aes_word_t s0 = aes_nohw_xor(t59, t63); + aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); + aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); + aes_word_t t67 = aes_nohw_xor(t64, t65); + aes_word_t s3 = aes_nohw_xor(t53, t66); + aes_word_t s4 = aes_nohw_xor(t51, t66); + aes_word_t s5 = aes_nohw_xor(t47, t65); + aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); + aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); + + batch->w[0] = s7; + batch->w[1] = s6; + batch->w[2] = s5; + batch->w[3] = s4; + batch->w[4] = s3; + batch->w[5] = s2; + batch->w[6] = s1; + batch->w[7] = s0; +} + +// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated +// to the right by |n|. This is a macro because |aes_nohw_shift_*| require +// constant shift counts in the SSE2 implementation. +#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ + (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ + aes_nohw_shift_left((v), 16 - (n)*4))) + +static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { + for (size_t i = 0; i < 8; i++) { + aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); + aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); + aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); + aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); + row1 = aes_nohw_rotate_cols_right(row1, 1); + row2 = aes_nohw_rotate_cols_right(row2, 2); + row3 = aes_nohw_rotate_cols_right(row3, 3); + batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); + } +} + +// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated +// down by one. +static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { +#if defined(OPENSSL_64_BIT) + return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | + ((v << 12) & UINT64_C(0xf000f000f000f000)); +#else + return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); +#endif +} + +// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated +// by two. +static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { +#if defined(OPENSSL_64_BIT) + return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | + ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); +#else + return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); +#endif +} + +static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { + // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
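+  // In bitsliced form, each w[j] holds one bit plane of the state, so
+  // multiplying by x (xtime) moves data up one plane, with plane 7 feeding
+  // back into planes 0, 1, 3, and 4 (the 0x1b reduction).
+  // |aes_nohw_rotate_rows_down| aligns each byte with its neighbor in the
+  // column so the per-column sums can be formed with plain XORs.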
+ aes_word_t a0 = batch->w[0]; + aes_word_t a1 = batch->w[1]; + aes_word_t a2 = batch->w[2]; + aes_word_t a3 = batch->w[3]; + aes_word_t a4 = batch->w[4]; + aes_word_t a5 = batch->w[5]; + aes_word_t a6 = batch->w[6]; + aes_word_t a7 = batch->w[7]; + + aes_word_t r0 = aes_nohw_rotate_rows_down(a0); + aes_word_t a0_r0 = aes_nohw_xor(a0, r0); + aes_word_t r1 = aes_nohw_rotate_rows_down(a1); + aes_word_t a1_r1 = aes_nohw_xor(a1, r1); + aes_word_t r2 = aes_nohw_rotate_rows_down(a2); + aes_word_t a2_r2 = aes_nohw_xor(a2, r2); + aes_word_t r3 = aes_nohw_rotate_rows_down(a3); + aes_word_t a3_r3 = aes_nohw_xor(a3, r3); + aes_word_t r4 = aes_nohw_rotate_rows_down(a4); + aes_word_t a4_r4 = aes_nohw_xor(a4, r4); + aes_word_t r5 = aes_nohw_rotate_rows_down(a5); + aes_word_t a5_r5 = aes_nohw_xor(a5, r5); + aes_word_t r6 = aes_nohw_rotate_rows_down(a6); + aes_word_t a6_r6 = aes_nohw_xor(a6, r6); + aes_word_t r7 = aes_nohw_rotate_rows_down(a7); + aes_word_t a7_r7 = aes_nohw_xor(a7, r7); + + batch->w[0] = + aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); + batch->w[1] = + aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), + aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); + batch->w[2] = + aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); + batch->w[3] = + aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), + aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); + batch->w[4] = + aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), + aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); + batch->w[5] = + aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); + batch->w[6] = + aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); + batch->w[7] = + aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); +} + +static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, + size_t num_rounds, AES_NOHW_BATCH *batch) { + aes_nohw_add_round_key(batch, &key->keys[0]); + for (size_t i = 1; i < num_rounds; i++) { + aes_nohw_sub_bytes(batch); + aes_nohw_shift_rows(batch); + aes_nohw_mix_columns(batch); + aes_nohw_add_round_key(batch, &key->keys[i]); + } + aes_nohw_sub_bytes(batch); + aes_nohw_shift_rows(batch); + aes_nohw_add_round_key(batch, &key->keys[num_rounds]); +} + +// Key schedule. + +static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, + const AES_KEY *key) { + for (size_t i = 0; i <= key->rounds; i++) { + // Copy the round key into each block in the batch. + for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { + aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; + OPENSSL_memcpy(tmp, key->rd_key + 4 * i, 16); + aes_nohw_batch_set(&out->keys[i], tmp, j); + } + aes_nohw_transpose(&out->keys[i]); + } +} + +static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, + 0x20, 0x40, 0x80, 0x1b, 0x36}; + +// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in +// |rcon|, stored in a |aes_word_t|. 
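+// On 64-bit words each slice is four bits wide, so |rcon| is split across
+// block[0] and block[1], mirroring how a compact block splits each byte.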
+static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { + rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); + return ((aes_word_t)rcon); +} + +static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], + const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { + AES_NOHW_BATCH batch; + OPENSSL_memset(&batch, 0, sizeof(batch)); + aes_nohw_batch_set(&batch, in, 0); + aes_nohw_transpose(&batch); + aes_nohw_sub_bytes(&batch); + aes_nohw_transpose(&batch); + aes_nohw_batch_get(&batch, out, 0); +} + +static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { + key->rounds = 10; + + aes_word_t block[AES_NOHW_BLOCK_WORDS]; + aes_nohw_compact_block(block, in); + OPENSSL_memcpy(key->rd_key, block, 16); + + for (size_t i = 1; i <= 10; i++) { + aes_word_t sub[AES_NOHW_BLOCK_WORDS]; + aes_nohw_sub_block(sub, block); + uint8_t rcon = aes_nohw_rcon[i - 1]; + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate |rcon| and the transformed word into the first word. + block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); + block[j] = aes_nohw_xor( + block[j], + aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); + // Propagate to the remaining words. Note this is reordered from the usual + // formulation to avoid needing masks. + aes_word_t v = block[j]; + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); + } + OPENSSL_memcpy(key->rd_key + 4 * i, block, 16); + } +} + +static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { + key->rounds = 14; + + // Each key schedule iteration produces two round keys. + aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; + aes_nohw_compact_block(block1, in); + OPENSSL_memcpy(key->rd_key, block1, 16); + + aes_nohw_compact_block(block2, in + 16); + OPENSSL_memcpy(key->rd_key + 4, block2, 16); + + for (size_t i = 2; i <= 14; i += 2) { + aes_word_t sub[AES_NOHW_BLOCK_WORDS]; + aes_nohw_sub_block(sub, block2); + uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate |rcon| and the transformed word into the first word. + block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); + block1[j] = aes_nohw_xor( + block1[j], + aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); + // Propagate to the remaining words. + aes_word_t v = block1[j]; + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); + } + OPENSSL_memcpy(key->rd_key + 4 * i, block1, 16); + + if (i == 14) { + break; + } + + aes_nohw_sub_block(sub, block1); + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate the transformed word into the first word. + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); + // Propagate to the remaining words. + aes_word_t v = block2[j]; + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); + } + OPENSSL_memcpy(key->rd_key + 4 * (i + 1), block2, 16); + } +} + + +// External API. 
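(Editor's note, not part of the patch: a minimal sketch of how a C caller might drive the entry points defined just below. It assumes only the AES_KEY type and the two declarations from this file; ring itself calls these functions from Rust.)

    #include <stdint.h>

    void encrypt_one_block(const uint8_t key_bytes[16], const uint8_t in[16],
                           uint8_t out[16]) {
      AES_KEY key;
      // aes_nohw_set_encrypt_key returns 0 on success; only 128- and 256-bit
      // keys are supported by this implementation.
      if (aes_nohw_set_encrypt_key(key_bytes, 128, &key) != 0) {
        return;
      }
      aes_nohw_encrypt(in, out, &key);
    }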
+ +int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, + AES_KEY *aeskey) { + switch (bits) { + case 128: + aes_nohw_setup_key_128(aeskey, key); + return 0; + case 256: + aes_nohw_setup_key_256(aeskey, key); + return 0; + } + return 1; +} + +void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { + AES_NOHW_SCHEDULE sched; + aes_nohw_expand_round_keys(&sched, key); + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); + aes_nohw_encrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); +} + +static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], + const uint8_t b[16]) { + for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { + aes_word_t x, y; + OPENSSL_memcpy(&x, a + i, sizeof(aes_word_t)); + OPENSSL_memcpy(&y, b + i, sizeof(aes_word_t)); + x = aes_nohw_xor(x, y); + OPENSSL_memcpy(out + i, &x, sizeof(aes_word_t)); + } +} + +void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t blocks, const AES_KEY *key, + const uint8_t ivec[16]) { + if (blocks == 0) { + return; + } + + AES_NOHW_SCHEDULE sched; + aes_nohw_expand_round_keys(&sched, key); + + // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. + alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; + alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; + for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { + OPENSSL_memcpy(ivs + 16 * i, ivec, 16); + } + + uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); + for (;;) { + // Update counters. + for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { + CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); + } + + size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, ivs, todo); + aes_nohw_encrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(enc_ivs, todo, &batch); + + for (size_t i = 0; i < todo; i++) { + aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); + } + + blocks -= todo; + if (blocks == 0) { + break; + } + + in += 16 * AES_NOHW_BATCH_SIZE; + out += 16 * AES_NOHW_BATCH_SIZE; + ctr += AES_NOHW_BATCH_SIZE; + } +} diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl new file mode 100644 index 0000000000..f4c546a70a --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl @@ -0,0 +1,1038 @@ +#!/usr/bin/env perl +# Copyright 2024 The BoringSSL Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#------------------------------------------------------------------------------ +# +# VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version) +# +# This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512 +# / AVX10. This means it can only use 16 vector registers instead of 32, the +# maximum vector length is 32 bytes, and some instructions such as vpternlogd +# and masked loads/stores are unavailable. 
However, it is able to run on CPUs +# that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan" +# server processors) and some Intel client CPUs such as Alder Lake. +# +# This implementation also uses Karatsuba multiplication instead of schoolbook +# multiplication for GHASH in its main loop. This does not help much on Intel, +# but it improves performance by ~5% on AMD Zen 3 which is the main target for +# this implementation. Other factors weighing slightly in favor of Karatsuba +# multiplication in this implementation are the lower maximum vector length +# (which means there is space left in the Htable array to cache the halves of +# the key powers XOR'd together) and the unavailability of the vpternlogd +# instruction (which helped schoolbook a bit more than Karatsuba). + +use strict; + +my $flavour = shift; +my $output = shift; +if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; } + +my $win64; +my @argregs; +if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) { + $win64 = 1; + @argregs = ( "%rcx", "%rdx", "%r8", "%r9" ); +} +else { + $win64 = 0; + @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" ); +} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; +my $dir = $1; +my $xlate; +( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate ) + or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate ) + or die "can't locate x86_64-xlate.pl"; + +open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT = *OUT; + +my $g_cur_func_name; +my $g_cur_func_uses_seh; +my @g_cur_func_saved_gpregs; +my @g_cur_func_saved_xmmregs; + +sub _begin_func { + my ( $funcname, $uses_seh ) = @_; + $g_cur_func_name = $funcname; + $g_cur_func_uses_seh = $uses_seh; + @g_cur_func_saved_gpregs = (); + @g_cur_func_saved_xmmregs = (); + return <<___; +.globl $funcname +.type $funcname,\@abi-omnipotent +.align 32 +$funcname: + .cfi_startproc + @{[ $uses_seh ? ".seh_startproc" : "" ]} + _CET_ENDBR +___ +} + +# Push a list of general purpose registers onto the stack. +sub _save_gpregs { + my @gpregs = @_; + my $code = ""; + die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh; + die "_save_gpregs can only be called once per function" + if @g_cur_func_saved_gpregs; + die "Order must be _save_gpregs, then _save_xmmregs" + if @g_cur_func_saved_xmmregs; + @g_cur_func_saved_gpregs = @gpregs; + for my $reg (@gpregs) { + $code .= "push $reg\n"; + if ($win64) { + $code .= ".seh_pushreg $reg\n"; + } + else { + $code .= ".cfi_push $reg\n"; + } + } + return $code; +} + +# Push a list of xmm registers onto the stack if the target is Windows. +sub _save_xmmregs { + my @xmmregs = @_; + my $num_xmmregs = scalar @xmmregs; + my $code = ""; + die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh; + die "_save_xmmregs can only be called once per function" + if @g_cur_func_saved_xmmregs; + if ( $win64 and $num_xmmregs > 0 ) { + @g_cur_func_saved_xmmregs = @xmmregs; + my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; + my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 ); + $code .= "sub \$$alloc_size, %rsp\n"; + $code .= ".seh_stackalloc $alloc_size\n"; + for my $i ( 0 .. $num_xmmregs - 1 ) { + my $reg_num = $xmmregs[$i]; + my $pos = 16 * $i; + $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n"; + $code .= ".seh_savexmm %xmm$reg_num, $pos\n"; + } + } + return $code; +} + +sub _end_func { + my $code = ""; + + # Restore any xmm registers that were saved earlier. 
+ my $num_xmmregs = scalar @g_cur_func_saved_xmmregs; + if ( $win64 and $num_xmmregs > 0 ) { + my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; + my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 ); + for my $i ( 0 .. $num_xmmregs - 1 ) { + my $reg_num = $g_cur_func_saved_xmmregs[$i]; + my $pos = 16 * $i; + $code .= "movdqa $pos(%rsp), %xmm$reg_num\n"; + } + $code .= "add \$$alloc_size, %rsp\n"; + } + + # Restore any general purpose registers that were saved earlier. + for my $reg ( reverse @g_cur_func_saved_gpregs ) { + $code .= "pop $reg\n"; + if ( !$win64 ) { + $code .= ".cfi_pop $reg\n"; + } + } + + $code .= <<___; + ret + @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]} + .cfi_endproc + .size $g_cur_func_name, . - $g_cur_func_name +___ + return $code; +} + +my $code = <<___; +.section .rodata +.align 16 + + # A shuffle mask that reflects the bytes of 16-byte blocks +.Lbswap_mask: + .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + # This is the GHASH reducing polynomial without its constant term, i.e. + # x^128 + x^7 + x^2 + x, represented using the backwards mapping + # between bits and polynomial coefficients. + # + # Alternatively, it can be interpreted as the naturally-ordered + # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + # "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .quad 1, 0xc200000000000000 + + # Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .quad 1, 0xc200000000000001 + +.align 32 + # The below constants are used for incrementing the counter blocks. +.Lctr_pattern: + .quad 0, 0 + .quad 1, 0 +.Linc_2blocks: + .quad 2, 0 + .quad 2, 0 + +.text +___ + +# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the +# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) +# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. +my $NUM_H_POWERS = 8; +my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16; +my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS; + +# Offset to 'rounds' in AES_KEY struct +my $OFFSETOF_AES_ROUNDS = 240; + +# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store +# the reduced products in \dst. Uses schoolbook multiplication. +sub _ghash_mul { + my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_; + return <<___; + vpclmulqdq \$0x00, $a, $b, $t0 # LO = a_L * b_L + vpclmulqdq \$0x01, $a, $b, $t1 # MI_0 = a_L * b_H + vpclmulqdq \$0x10, $a, $b, $t2 # MI_1 = a_H * b_L + vpxor $t2, $t1, $t1 # MI = MI_0 + MI_1 + vpclmulqdq \$0x01, $t0, $gfpoly, $t2 # LO_L*(x^63 + x^62 + x^57) + vpshufd \$0x4e, $t0, $t0 # Swap halves of LO + vpxor $t0, $t1, $t1 # Fold LO into MI (part 1) + vpxor $t2, $t1, $t1 # Fold LO into MI (part 2) + vpclmulqdq \$0x11, $a, $b, $dst # HI = a_H * b_H + vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57) + vpshufd \$0x4e, $t1, $t1 # Swap halves of MI + vpxor $t1, $dst, $dst # Fold MI into HI (part 1) + vpxor $t0, $dst, $dst # Fold MI into HI (part 2) +___ +} + +# void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]); +# +# Initialize |Htable| with powers of the GHASH subkey |H|. +# +# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the +# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) +# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. +$code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1; +{ + my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 
1 ]; + my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $H_CUR, $H_CUR_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" ); + my ( $H_INC, $H_INC_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" ); + + $code .= <<___; + @{[ _save_xmmregs (6) ]} + .seh_endprologue + + # Load the byte-reflected hash subkey. BoringSSL provides it in + # byte-reflected form except the two halves are in the wrong order. + vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM + + # Finish preprocessing the byte-reflected hash subkey by multiplying it by + # x^-1 ("standard" interpretation of polynomial coefficients) or + # equivalently x^1 (natural interpretation). This gets the key into a + # format that avoids having to bit-reflect the data blocks later. + vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM + vpsrad \$31, $TMP0_XMM, $TMP0_XMM + vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM + vpand .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM + vpxor $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM + + vbroadcasti128 .Lgfpoly(%rip), $GFPOLY + + # Square H^1 to get H^2. + @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM, + $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]} + + # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. + vinserti128 \$1, $H_CUR_XMM, $H_INC, $H_CUR + vinserti128 \$1, $H_INC_XMM, $H_INC, $H_INC + + # Compute H_CUR2 = [H^4, H^3]. + @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + + # Store [H^2, H^1] and [H^4, H^3]. + vmovdqu $H_CUR, 3*32($HTABLE) + vmovdqu $H_CUR2, 2*32($HTABLE) + + # For Karatsuba multiplication: compute and store the two 64-bit halves of + # each key power XOR'd together. Order is 4,2,3,1. + vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 + vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 + vpxor $TMP1, $TMP0, $TMP0 + vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE) + + # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. + @{[ _ghash_mul $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + vmovdqu $H_CUR, 1*32($HTABLE) + vmovdqu $H_CUR2, 0*32($HTABLE) + + # Again, compute and store the two 64-bit halves of each key power XOR'd + # together. Order is 8,6,7,5. + vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 + vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 + vpxor $TMP1, $TMP0, $TMP0 + vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE) + + vzeroupper +___ +} +$code .= _end_func; + +# Do one step of the GHASH update of four vectors of data blocks. +# $i: the step to do, 0 through 9 +# $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) +# $htable: pointer to the Htable for the key +# $bswap_mask: mask for reflecting the bytes of blocks +# $h_pow[2-1]_xored: XOR'd key powers cached from Htable +# $tmp[0-2]: temporary registers. $tmp[1-2] must be preserved across steps. 
+# $lo, $mi: working state for this macro that must be preserved across steps +# $ghash_acc: the GHASH accumulator (input/output) +sub _ghash_step_4x { + my ( + $i, $ghashdata_ptr, $htable, $bswap_mask, + $h_pow2_xored, $h_pow1_xored, $tmp0, $tmp0_xmm, + $tmp1, $tmp2, $lo, $mi, + $ghash_acc, $ghash_acc_xmm + ) = @_; + my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm ); # alias + if ( $i == 0 ) { + return <<___; + # First vector + vmovdqu 0*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 0*32($htable), $tmp2 + vpxor $ghash_acc, $tmp1, $tmp1 + vpclmulqdq \$0x00, $tmp2, $tmp1, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x00, $h_pow2_xored, $tmp0, $mi +___ + } + elsif ( $i == 1 ) { + return <<___; +___ + } + elsif ( $i == 2 ) { + return <<___; + # Second vector + vmovdqu 1*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 1*32($htable), $tmp2 + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x10, $h_pow2_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 3 ) { + return <<___; + # Third vector + vmovdqu 2*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 2*32($htable), $tmp2 +___ + } + elsif ( $i == 4 ) { + return <<___; + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi +___ + } + elsif ( $i == 5 ) { + return <<___; + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x00, $h_pow1_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi + + # Fourth vector + vmovdqu 3*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 +___ + } + elsif ( $i == 6 ) { + return <<___; + vmovdqu 3*32($htable), $tmp2 + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x10, $h_pow1_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 7 ) { + return <<___; + # Finalize 'mi' following Karatsuba multiplication. + vpxor $lo, $mi, $mi + vpxor $hi, $mi, $mi + + # Fold lo into mi. + vbroadcasti128 .Lgfpoly(%rip), $tmp2 + vpclmulqdq \$0x01, $lo, $tmp2, $tmp0 + vpshufd \$0x4e, $lo, $lo + vpxor $lo, $mi, $mi + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 8 ) { + return <<___; + # Fold mi into hi. + vpclmulqdq \$0x01, $mi, $tmp2, $tmp0 + vpshufd \$0x4e, $mi, $mi + vpxor $mi, $hi, $hi + vpxor $tmp0, $hi, $hi +___ + } + elsif ( $i == 9 ) { + return <<___; + vextracti128 \$1, $hi, $tmp0_xmm + vpxor $tmp0_xmm, $hi_xmm, $ghash_acc_xmm +___ + } +} + +sub _ghash_4x { + my $code = ""; + for my $i ( 0 .. 9 ) { + $code .= _ghash_step_4x $i, @_; + } + return $code; +} + +# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16], +# const uint8_t *in, size_t len); +# +# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given +# by |in| and |len|. |len| must be exactly 16. +$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_1", 1; +{ + # Function arguments + my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 
3 ]; + + # Additional local variables + my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $LO, $LO_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $MI, $MI_XMM ) = ( "%ymm4", "%xmm4" ); + my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" ); + my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm7", "%xmm7" ); + my $H_POW2_XORED = "%ymm8"; + my $H_POW1_XORED = "%ymm9"; + + $code .= <<___; + @{[ _save_xmmregs (6 .. 9) ]} + .seh_endprologue + + # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small, + # usually only 128-bit vectors will be used. So as an optimization, don't + # broadcast these constants to both 128-bit lanes quite yet. + vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM + vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM + + # Load the GHASH accumulator. + vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + + + # Update GHASH with the remaining 16-byte block if any. +.Lghash_lastblock: + vmovdqu ($AAD), $TMP0_XMM + vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM + vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM + @{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM, + $TMP1_XMM, $TMP2_XMM, $LO_XMM ]} + +.Lghash_done: + # Store the updated GHASH accumulator back to memory. + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) + + vzeroupper +___ +} +$code .= _end_func; + +sub _vaesenc_4x { + my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_; + return <<___; + vaesenc $round_key, $aesdata0, $aesdata0 + vaesenc $round_key, $aesdata1, $aesdata1 + vaesenc $round_key, $aesdata2, $aesdata2 + vaesenc $round_key, $aesdata3, $aesdata3 +___ +} + +sub _ctr_begin_4x { + my ( + $le_ctr, $bswap_mask, $rndkey0, $aesdata0, + $aesdata1, $aesdata2, $aesdata3, $tmp + ) = @_; + return <<___; + # Increment le_ctr four times to generate four vectors of little-endian + # counter blocks, swap each to big-endian, and store them in aesdata[0-3]. + vmovdqu .Linc_2blocks(%rip), $tmp + vpshufb $bswap_mask, $le_ctr, $aesdata0 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata1 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata2 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata3 + vpaddd $tmp, $le_ctr, $le_ctr + + # AES "round zero": XOR in the zero-th round key. + vpxor $rndkey0, $aesdata0, $aesdata0 + vpxor $rndkey0, $aesdata1, $aesdata1 + vpxor $rndkey0, $aesdata2, $aesdata2 + vpxor $rndkey0, $aesdata3, $aesdata3 +___ +} + +# Do the last AES round for four vectors of counter blocks, XOR four vectors of +# source data with the resulting keystream blocks, and write the result to the +# destination buffer. The implementation differs slightly as it takes advantage +# of the property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce +# latency, but it has the same effect. 
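(Editor's note, not part of the patch: the rewrite relies on AESENCLAST ending with an XOR by the round key, so vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). A sketch of a check of that identity using the SSE AES-NI intrinsics, assuming a compiler invoked with -maes:)

    #include <emmintrin.h>
    #include <wmmintrin.h>

    // Returns 1 if the identity holds for the given inputs.
    int aesenclast_xor_identity(__m128i a, __m128i key, __m128i b) {
      __m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);
      __m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));
      return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs)) == 0xFFFF;
    }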
+sub _aesenclast_and_xor_4x { + my ( + $src, $dst, $rndkeylast, $aesdata0, + $aesdata1, $aesdata2, $aesdata3, $t0, + $t1, $t2, $t3 + ) = @_; + return <<___; + vpxor 0*32($src), $rndkeylast, $t0 + vpxor 1*32($src), $rndkeylast, $t1 + vpxor 2*32($src), $rndkeylast, $t2 + vpxor 3*32($src), $rndkeylast, $t3 + vaesenclast $t0, $aesdata0, $aesdata0 + vaesenclast $t1, $aesdata1, $aesdata1 + vaesenclast $t2, $aesdata2, $aesdata2 + vaesenclast $t3, $aesdata3, $aesdata3 + vmovdqu $aesdata0, 0*32($dst) + vmovdqu $aesdata1, 1*32($dst) + vmovdqu $aesdata2, 2*32($dst) + vmovdqu $aesdata3, 3*32($dst) +___ +} + +my $g_update_macro_expansion_count = 0; + +# void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out, +# size_t len, const AES_KEY *key, +# const uint8_t ivec[16], +# const u128 Htable[16], +# uint8_t Xi[16]); +# +# This macro generates a GCM encryption or decryption update function with the +# above prototype (with \enc selecting which one). The function computes the +# next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and +# writes the resulting encrypted or decrypted data to |out|. It also updates +# the GHASH accumulator |Xi| using the next |len| ciphertext bytes. +# +# |len| must be a multiple of 16. The caller must do any buffering needed to +# ensure this. Both in-place and out-of-place en/decryption are supported. +# +# |ivec| must give the current counter in big-endian format. This function +# loads the counter from |ivec| and increments the loaded counter as needed, but +# it does *not* store the updated counter back to |ivec|. The caller must +# update |ivec| if any more data segments follow. Internally, only the low +# 32-bit word of the counter is incremented, following the GCM standard. +sub _aes_gcm_update { + my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count; + my ($enc) = @_; + my $code = ""; + + # Function arguments + my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ) + = $win64 + ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" ) + : ( @argregs[ 0 .. 5 ], "%r12" ); + + # Additional local variables. + # %rax is used as a temporary register. BE_CTR_PTR is also available as a + # temporary register after the counter is loaded. + + # AES key length in bytes + my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" ); + + # Pointer to the last AES round key for the chosen AES variant + my $RNDKEYLAST_PTR = "%r11"; + + # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + # using vpshufb, copied to all 128-bit lanes. + my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" ); + + # GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + # only the lowest 128-bit lane can be nonzero. When not fully reduced, + # more than one lane may be used, and they need to be XOR'd together. + my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" ); + + # TMP[0-2] are temporary registers. + my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" ); + + # LO and MI are used to accumulate unreduced GHASH products. + my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" ); + + # Cached key powers from Htable + my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" ); + my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" ); + + # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. 
+ my $RNDKEY0 = "%ymm9"; + my $RNDKEYLAST = "%ymm10"; + + # LE_CTR contains the next set of little-endian counter blocks. + my $LE_CTR = "%ymm11"; + + # AESDATA[0-3] hold the counter blocks that are being encrypted by AES. + my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" ); + my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" ); + my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" ); + my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" ); + my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 ); + + my @ghash_4x_args = ( + $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, + $H_POW1_XORED, $TMP0, $TMP0_XMM, $TMP1, + $TMP2, $LO, $MI, $GHASH_ACC, + $GHASH_ACC_XMM + ); + + if ($win64) { + $code .= <<___; + @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]} + mov 64(%rsp), $BE_CTR_PTR # arg5 + mov 72(%rsp), $HTABLE # arg6 + mov 80(%rsp), $GHASH_ACC_PTR # arg7 + @{[ _save_xmmregs (6 .. 15) ]} + .seh_endprologue +___ + } + else { + $code .= <<___; + @{[ _save_gpregs $GHASH_ACC_PTR ]} + mov 16(%rsp), $GHASH_ACC_PTR # arg7 +___ + } + + if ($enc) { + $code .= <<___; +#ifdef BORINGSSL_DISPATCH_TEST + .extern BORINGSSL_function_hit + movb \$1,BORINGSSL_function_hit+8(%rip) +#endif +___ + } + $code .= <<___; + vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK + + # Load the GHASH accumulator and the starting counter. + # BoringSSL passes these values in big endian format. + vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vbroadcasti128 ($BE_CTR_PTR), $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR + + # Load the AES key length in bytes. BoringSSL stores number of rounds + # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20. + movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN + lea -20(,$AESKEYLEN,4), $AESKEYLEN + + # Make RNDKEYLAST_PTR point to the last AES round key. This is the + # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + # respectively. Then load the zero-th and last round keys. + lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR + vbroadcasti128 ($AESKEY), $RNDKEY0 + vbroadcasti128 ($RNDKEYLAST_PTR), $RNDKEYLAST + + # Finish initializing LE_CTR by adding 1 to the second block. + vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR + + # If there are at least 128 bytes of data, then continue into the loop that + # processes 128 bytes of data at a time. Otherwise skip it. + cmp \$127, $DATALEN + jbe .Lcrypt_loop_4x_done$local_label_suffix + + vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED + vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED +___ + + # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. + + if ($enc) { + $code .= <<___; + # Encrypt the first 4 vectors of plaintext blocks. + @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} + lea 16($AESKEY), %rax +.Lvaesenc_loop_first_4_vecs$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_first_4_vecs$local_label_suffix + @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, + $TMP0, $TMP1, $LO, $MI ]} + sub \$-128, $SRC # 128 is 4 bytes, -128 is 1 byte + add \$-128, $DATALEN + cmp \$127, $DATALEN + jbe .Lghash_last_ciphertext_4x$local_label_suffix +___ + } + + $code .= <<___; +.align 16 +.Lcrypt_loop_4x$local_label_suffix: + + # Start the AES encryption of the counter blocks. 
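+    # The round count depends on the key length, so the extra initial rounds
+    # are dispatched here: AES-256 runs two extra pairs of rounds and falls
+    # through into the AES-192 rounds, which fall through into the common
+    # AES-128 tail.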
+ @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} + cmp \$24, $AESKEYLEN + jl .Laes128$local_label_suffix + je .Laes192$local_label_suffix + # AES-256 + vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +.Laes192$local_label_suffix: + vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +.Laes128$local_label_suffix: +___ + + # Prefetch the source data 512 bytes ahead into the L1 data cache, to + # improve performance when the hardware prefetcher is disabled. Assumes the + # L1 data cache line size is 64 bytes (de facto standard on x86_64). + $code .= "prefetcht0 512($SRC)\n"; + $code .= "prefetcht0 512+64($SRC)\n"; + + # Finish the AES encryption of the counter blocks in AESDATA[0-3], + # interleaved with the GHASH update of the ciphertext blocks. + for my $i ( reverse 1 .. 9 ) { + $code .= <<___; + @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]} + vbroadcasti128 -$i*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +___ + } + $code .= <<___; + @{[ _ghash_step_4x 9, @ghash_4x_args ]} + + @{[ $enc ? "sub \$-128, $DST" : "" ]} # 128 is 4 bytes, -128 is 1 byte + @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, + $TMP0, $TMP1, $LO, $MI ]} + sub \$-128, $SRC + @{[ !$enc ? "sub \$-128, $DST" : "" ]} + add \$-128, $DATALEN + cmp \$127, $DATALEN + ja .Lcrypt_loop_4x$local_label_suffix +___ + + if ($enc) { + + # Update GHASH with the last set of ciphertext blocks. + $code .= <<___; +.Lghash_last_ciphertext_4x$local_label_suffix: + @{[ _ghash_4x @ghash_4x_args ]} + sub \$-128, $DST +___ + } + + my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused. + my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM ); # reuse + + $code .= <<___; +.Lcrypt_loop_4x_done$local_label_suffix: + # Check whether any data remains. + test $DATALEN, $DATALEN + jz .Ldone$local_label_suffix + + # DATALEN is in [16, 32, 48, 64, 80, 96, 112]. + + # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N + # is the number of blocks that remain. + lea $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR + sub $DATALEN, $POWERS_PTR + + # Start collecting the unreduced GHASH intermediate value LO, MI, HI. + vpxor $LO_XMM, $LO_XMM, $LO_XMM + vpxor $MI_XMM, $MI_XMM, $MI_XMM + vpxor $HI_XMM, $HI_XMM, $HI_XMM + + cmp \$64, $DATALEN + jb .Llessthan64bytes$local_label_suffix + + # DATALEN is in [64, 80, 96, 112]. Encrypt two vectors of counter blocks. + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpxor $RNDKEY0, $AESDATA0, $AESDATA0 + vpxor $RNDKEY0, $AESDATA1, $AESDATA1 + lea 16($AESKEY), %rax +.Lvaesenc_loop_tail_1$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + vaesenc $TMP0, $AESDATA0, $AESDATA0 + vaesenc $TMP0, $AESDATA1, $AESDATA1 + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_tail_1$local_label_suffix + vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 + vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 + + # XOR the data with the two vectors of keystream blocks. 
+ vmovdqu 0($SRC), $TMP0 + vmovdqu 32($SRC), $TMP1 + vpxor $TMP0, $AESDATA0, $AESDATA0 + vpxor $TMP1, $AESDATA1, $AESDATA1 + vmovdqu $AESDATA0, 0($DST) + vmovdqu $AESDATA1, 32($DST) + + # Update GHASH with two vectors of ciphertext blocks, without reducing. + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1 + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + vmovdqu 32($POWERS_PTR), $TMP1 + vpclmulqdq \$0x00, $TMP0, $AESDATA0, $LO + vpclmulqdq \$0x01, $TMP0, $AESDATA0, $MI + vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP0, $AESDATA0, $HI + vpclmulqdq \$0x00, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $HI, $HI + + add \$64, $POWERS_PTR + add \$64, $SRC + add \$64, $DST + sub \$64, $DATALEN + jz .Lreduce$local_label_suffix + + vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + + # DATALEN is in [16, 32, 48]. Encrypt two last vectors of counter blocks. +.Llessthan64bytes$local_label_suffix: + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 + vpxor $RNDKEY0, $AESDATA0, $AESDATA0 + vpxor $RNDKEY0, $AESDATA1, $AESDATA1 + lea 16($AESKEY), %rax +.Lvaesenc_loop_tail_2$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + vaesenc $TMP0, $AESDATA0, $AESDATA0 + vaesenc $TMP0, $AESDATA1, $AESDATA1 + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_tail_2$local_label_suffix + vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 + vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 + + # XOR the remaining data with the keystream blocks, and update GHASH with + # the remaining ciphertext blocks without reducing. + + cmp \$32, $DATALEN + jb .Lxor_one_block$local_label_suffix + je .Lxor_two_blocks$local_label_suffix + +.Lxor_three_blocks$local_label_suffix: + vmovdqu 0($SRC), $TMP0 + vmovdqu 32($SRC), $TMP1_XMM + vpxor $TMP0, $AESDATA0, $AESDATA0 + vpxor $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM + vmovdqu $AESDATA0, 0($DST) + vmovdqu $AESDATA1_XMM, 32($DST) + + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + vmovdqu 32($POWERS_PTR), $TMP1_XMM + vpclmulqdq \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $HI, $HI + jmp .Lghash_mul_one_vec_unreduced$local_label_suffix + +.Lxor_two_blocks$local_label_suffix: + vmovdqu ($SRC), $TMP0 + vpxor $TMP0, $AESDATA0, $AESDATA0 + vmovdqu $AESDATA0, ($DST) + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + jmp .Lghash_mul_one_vec_unreduced$local_label_suffix + +.Lxor_one_block$local_label_suffix: + vmovdqu ($SRC), $TMP0_XMM + vpxor $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM + vmovdqu $AESDATA0_XMM, ($DST) + vpshufb $BSWAP_MASK_XMM, @{[ $enc ? 
$AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM + vpxor $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM + vmovdqu ($POWERS_PTR), $TMP0_XMM + +.Lghash_mul_one_vec_unreduced$local_label_suffix: + vpclmulqdq \$0x00, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $HI, $HI + +.Lreduce$local_label_suffix: + # Finally, do the GHASH reduction. + vbroadcasti128 .Lgfpoly(%rip), $TMP0 + vpclmulqdq \$0x01, $LO, $TMP0, $TMP1 + vpshufd \$0x4e, $LO, $LO + vpxor $LO, $MI, $MI + vpxor $TMP1, $MI, $MI + vpclmulqdq \$0x01, $MI, $TMP0, $TMP1 + vpshufd \$0x4e, $MI, $MI + vpxor $MI, $HI, $HI + vpxor $TMP1, $HI, $HI + vextracti128 \$1, $HI, $GHASH_ACC_XMM + vpxor $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + +.Ldone$local_label_suffix: + # Store the updated GHASH accumulator back to memory. + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) + + vzeroupper +___ + return $code; +} + +$code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1; +$code .= _aes_gcm_update 1; +$code .= _end_func; + +$code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1; +$code .= _aes_gcm_update 0; +$code .= _end_func; + +sub filter_and_print { + # This function replaces AVX2 assembly instructions with their assembled forms, + # to allow the code to work on old versions of binutils (older than 2.30) that do + # not support these instructions. + my %asmMap = ( + 'vaesenc %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdc,0xe2', + 'vaesenc %ymm2, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdc,0xea', + 'vaesenc %ymm2, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdc,0xf2', + 'vaesenc %ymm2, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdc,0xfa', + 'vaesenclast %ymm10, %ymm12, %ymm12' => '.byte 0xc4,0x42,0x1d,0xdd,0xe2', + 'vaesenclast %ymm10, %ymm13, %ymm13' => '.byte 0xc4,0x42,0x15,0xdd,0xea', + 'vaesenclast %ymm2, %ymm12, %ymm12' => '.byte 0xc4,0x62,0x1d,0xdd,0xe2', + 'vaesenclast %ymm3, %ymm13, %ymm13' => '.byte 0xc4,0x62,0x15,0xdd,0xeb', + 'vaesenclast %ymm5, %ymm14, %ymm14' => '.byte 0xc4,0x62,0x0d,0xdd,0xf5', + 'vaesenclast %ymm6, %ymm15, %ymm15' => '.byte 0xc4,0x62,0x05,0xdd,0xfe', + 'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00', + 'vpclmulqdq $0x00, %ymm2, %ymm12, %ymm5' => '.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00', + 'vpclmulqdq $0x00, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00', + 'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00', + 'vpclmulqdq $0x00, %ymm4, %ymm3, %ymm5' => '.byte 0xc4,0xe3,0x65,0x44,0xec,0x00', + 'vpclmulqdq $0x00, %ymm5, %ymm3, %ymm0' => '.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00', + 'vpclmulqdq $0x00, %ymm5, %ymm4, %ymm0' => '.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00', + 'vpclmulqdq $0x00, %ymm7, %ymm2, %ymm6' => '.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00', + 'vpclmulqdq $0x00, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00', + 'vpclmulqdq $0x01, %ymm0, %ymm6, %ymm2' => '.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01', + 'vpclmulqdq $0x01, %ymm1, %ymm6, %ymm0' => '.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01', + 'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01', + 'vpclmulqdq $0x01, %ymm2, %ymm12, %ymm6' => '.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01', + 'vpclmulqdq $0x01, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01', + 'vpclmulqdq $0x01, %ymm5, %ymm2, %ymm3' => '.byte 
0xc4,0xe3,0x6d,0x44,0xdd,0x01', + 'vpclmulqdq $0x01, %ymm5, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01', + 'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm1' => '.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01', + 'vpclmulqdq $0x01, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01', + 'vpclmulqdq $0x01, %ymm6, %ymm2, %ymm3' => '.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01', + 'vpclmulqdq $0x01, %ymm6, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01', + 'vpclmulqdq $0x10, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10', + 'vpclmulqdq $0x10, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10', + 'vpclmulqdq $0x10, %ymm5, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10', + 'vpclmulqdq $0x10, %ymm5, %ymm4, %ymm2' => '.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10', + 'vpclmulqdq $0x10, %ymm7, %ymm2, %ymm2' => '.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10', + 'vpclmulqdq $0x10, %ymm8, %ymm2, %ymm2' => '.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10', + 'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm4' => '.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11', + 'vpclmulqdq $0x11, %ymm2, %ymm12, %ymm7' => '.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11', + 'vpclmulqdq $0x11, %ymm3, %ymm13, %ymm4' => '.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11', + 'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm1' => '.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11', + 'vpclmulqdq $0x11, %ymm4, %ymm3, %ymm2' => '.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11', + 'vpclmulqdq $0x11, %ymm5, %ymm3, %ymm4' => '.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11', + 'vpclmulqdq $0x11, %ymm5, %ymm4, %ymm3' => '.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11', + ); + for my $line (split("\n",$code)) { + my $trimmed; + $trimmed = $line; + $trimmed =~ s/^\s+//; + $trimmed =~ s/\s+(#.*)?$//; + if (exists $asmMap{$trimmed}) { + $line = $asmMap{$trimmed}; + } else { + if($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) { + die ("found instruction not supported under old binutils, please update asmMap with the results of running\n" . + 'find target -name "*aes-gcm-avx2*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq'); + } + } + print $line,"\n"; + } +} + +filter_and_print(); + +close STDOUT or die "error closing STDOUT: $!"; +exit 0; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl new file mode 100644 index 0000000000..77c8bbec63 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl @@ -0,0 +1,1120 @@ +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. 
+# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, 0.74 - on +# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled +# measurements for favourable packet size, one divisible by 96. +# Applications using the EVP interface will observe a few percent +# worse performance.] +# +# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will +# be computed incorrectly. +# +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +# +# The upstream code uses the condition |$avx>1| even though no AVX2 +# instructions are used, because it assumes MOVBE is supported by the assembler +# if and only if AVX2 is also supported by the assembler; see +# https://marc.info/?l=openssl-dev&m=146567589526984&w=2. +$avx = 2; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# See the comment above regarding why the condition is ($avx>1) when there are +# no AVX2 instructions being used. +if ($avx>1) {{{ + +# On Windows, only four parameters are passed in registers. The last two +# parameters will be manually loaded into %rdi and %rsi. +my ($inp, $out, $len, $key, $ivp, $Htable) = + $win64 ? ("%rcx", "%rdx", "%r8", "%r9", "%rdi", "%rsi") : + ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9"); + +# The offset from %rbp to the Xip parameter. On Windows, all parameters have +# corresponding stack positions, not just ones passed on the stack. +# (0x40 = 6*8 + 0x10) +# +# Xip only needs to be accessed at the beginning and end of the function, and +# this function is short on registers, so we make it the last parameter for +# convenience. +my $Xip_offset = $win64 ? 
0x40 : 0x10; + +($Ii,$T1,$T2,$Hkey, + $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); + +($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); + +($counter,$rounds,$const,$in0,$end0)=("%ebx","%r10d","%r11","%r14","%r15"); + +$code=<<___; +.text + +.type _aesni_ctr32_ghash_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_ghash_6x: +.cfi_startproc + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + sub \$6,$len + vpxor $Z0,$Z0,$Z0 # $Z0 = 0 + vmovdqu 0x00-0x80($key),$rndkey + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpaddb $T2,$inout2,$inout3 + vpaddb $T2,$inout3,$inout4 + vpaddb $T2,$inout4,$inout5 + vpxor $rndkey,$T1,$inout0 + vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 + jmp .Loop6x + +.align 32 +.Loop6x: + add \$`6<<24`,$counter + jc .Lhandle_ctr32 # discard $inout[1-5]? + vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 + vpaddb $T2,$inout5,$T1 # next counter value + vpxor $rndkey,$inout1,$inout1 + vpxor $rndkey,$inout2,$inout2 + +.Lresume_ctr32: + vmovdqu $T1,($ivp) # save next counter value + vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 + vpxor $rndkey,$inout3,$inout3 + vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey + vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 + + # At this point, the current block of 96 (0x60) bytes has already been + # loaded into registers. Concurrently with processing it, we want to + # load the next 96 bytes of input for the next round. Obviously, we can + # only do this if there are at least 96 more bytes of input beyond the + # input we're currently processing, or else we'd read past the end of + # the input buffer. Here, we set |%r12| to 96 if there are at least 96 + # bytes of input beyond the 96 bytes we're already processing, and we + # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96, + # we'll read in the next block so that it is in registers for the next + # loop iteration. In the case where we set |%r12| to 0, we'll re-read + # the current block and then ignore what we re-read. + # + # At this point, |$in0| points to the current (already read into + # registers) block, and |$end0| points to 2*96 bytes before the end of + # the input. Thus, |$in0| > |$end0| means that we do not have the next + # 96-byte block to read in, and |$in0| <= |$end0| means we do. 
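+  # Concretely, the sequence below is branchless: "cmp $in0,$end0" computes
+  # |$end0| - |$in0|, so CF is clear exactly when |$in0| <= |$end0|; setnc
+  # then writes 1 to %r12b in that case and 0 otherwise, neg turns that into
+  # all-ones or all-zeroes, and masking with 0x60 leaves 96 or 0 in |%r12|.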
+ xor %r12,%r12 + cmp $in0,$end0 + + vaesenc $T2,$inout0,$inout0 + vmovdqu 0x30+8(%rsp),$Ii # I[4] + vpxor $rndkey,$inout4,$inout4 + vpclmulqdq \$0x00,$Hkey,$Z3,$T1 + vaesenc $T2,$inout1,$inout1 + vpxor $rndkey,$inout5,$inout5 + setnc %r12b + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vaesenc $T2,$inout2,$inout2 + vmovdqu 0x10-0x20($Htable),$Hkey # $Hkey^2 + neg %r12 + vaesenc $T2,$inout3,$inout3 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 + vpxor $Z0,$Xi,$Xi # modulo-scheduled + vaesenc $T2,$inout4,$inout4 + vpxor $Z1,$T1,$Z0 + and \$0x60,%r12 + vmovups 0x20-0x80($key),$rndkey + vpclmulqdq \$0x10,$Hkey,$Ii,$T1 + vaesenc $T2,$inout5,$inout5 + + vpclmulqdq \$0x01,$Hkey,$Ii,$T2 + lea ($in0,%r12),$in0 + vaesenc $rndkey,$inout0,$inout0 + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] + vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey + vmovdqu 0x40+8(%rsp),$Ii # I[3] + vaesenc $rndkey,$inout1,$inout1 + movbe 0x58($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x50($in0),%r12 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x20+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x28+8(%rsp) + vmovdqu 0x30-0x20($Htable),$Z1 # borrow $Z1 for $Hkey^3 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x30-0x80($key),$rndkey + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x00,$Z1,$Ii,$T1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x10,$Z1,$Ii,$T2 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Hkey,$Z3,$Z3 + vpclmulqdq \$0x01,$Z1,$Ii,$Hkey + vaesenc $rndkey,$inout2,$inout2 + vpclmulqdq \$0x11,$Z1,$Ii,$Z1 + vmovdqu 0x50+8(%rsp),$Ii # I[2] + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vpxor $T1,$Z0,$Z0 + vmovdqu 0x40-0x20($Htable),$T1 # borrow $T1 for $Hkey^4 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x40-0x80($key),$rndkey + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x00,$T1,$Ii,$T2 + vaesenc $rndkey,$inout0,$inout0 + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x10,$T1,$Ii,$Hkey + vaesenc $rndkey,$inout1,$inout1 + movbe 0x48($in0),%r13 + vpxor $Z1,$Z3,$Z3 + vpclmulqdq \$0x01,$T1,$Ii,$Z1 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x40($in0),%r12 + vpclmulqdq \$0x11,$T1,$Ii,$T1 + vmovdqu 0x60+8(%rsp),$Ii # I[1] + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x30+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x38+8(%rsp) + vpxor $T2,$Z0,$Z0 + vmovdqu 0x60-0x20($Htable),$T2 # borrow $T2 for $Hkey^5 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x50-0x80($key),$rndkey + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x00,$T2,$Ii,$Hkey + vaesenc $rndkey,$inout0,$inout0 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$T2,$Ii,$Z1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x38($in0),%r13 + vpxor $T1,$Z3,$Z3 + vpclmulqdq \$0x01,$T2,$Ii,$T1 + vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] + vaesenc $rndkey,$inout2,$inout2 + movbe 0x30($in0),%r12 + vpclmulqdq \$0x11,$T2,$Ii,$T2 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x40+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x48+8(%rsp) + vpxor $Hkey,$Z0,$Z0 + vmovdqu 0x70-0x20($Htable),$Hkey # $Hkey^6 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x60-0x80($key),$rndkey + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x01,$Hkey,$Xi,$T1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x28($in0),%r13 + vpxor $T2,$Z3,$Z3 + vpclmulqdq \$0x00,$Hkey,$Xi,$T2 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x20($in0),%r12 + vpclmulqdq \$0x11,$Hkey,$Xi,$Xi + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x50+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x58+8(%rsp) + vpxor $Z1,$Z2,$Z2 + 
vaesenc $rndkey,$inout5,$inout5 + vpxor $T1,$Z2,$Z2 + + vmovups 0x70-0x80($key),$rndkey + vpslldq \$8,$Z2,$Z1 + vpxor $T2,$Z0,$Z0 + vmovdqu 0x10($const),$Hkey # .Lpoly + + vaesenc $rndkey,$inout0,$inout0 + vpxor $Xi,$Z3,$Z3 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Z1,$Z0,$Z0 + movbe 0x18($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x10($in0),%r12 + vpalignr \$8,$Z0,$Z0,$Ii # 1st phase + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + mov %r13,0x60+8(%rsp) + vaesenc $rndkey,$inout3,$inout3 + mov %r12,0x68+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vmovups 0x90-0x80($key),$rndkey + vaesenc $T1,$inout1,$inout1 + vpsrldq \$8,$Z2,$Z2 + vaesenc $T1,$inout2,$inout2 + vpxor $Z2,$Z3,$Z3 + vaesenc $T1,$inout3,$inout3 + vpxor $Ii,$Z0,$Z0 + movbe 0x08($in0),%r13 + vaesenc $T1,$inout4,$inout4 + movbe 0x00($in0),%r12 + vaesenc $T1,$inout5,$inout5 + vmovups 0xa0-0x80($key),$T1 + cmp \$11,$rounds + jb .Lenc_tail # 128-bit key + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xb0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xc0-0x80($key),$T1 + # 192-bit key support was removed. + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xd0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xe0-0x80($key),$T1 + jmp .Lenc_tail # 256-bit key + +.align 32 +.Lhandle_ctr32: + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd $Z1,$Z2,$inout2 + vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $rndkey,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $rndkey,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpshufb $Ii,$inout5,$inout5 + vpshufb $Ii,$T1,$T1 # next counter value + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc $rndkey,$inout0,$inout0 + vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi + vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase + vaesenc $rndkey,$inout1,$inout1 + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + vpxor 0x00($inp),$T1,$T2 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x10($inp),$T1,$Ii + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x20($inp),$T1,$Z1 + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x30($inp),$T1,$Z2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x40($inp),$T1,$Z3 + vpxor 0x50($inp),$T1,$Hkey + vmovdqu ($ivp),$T1 # load next counter value + + vaesenclast $T2,$inout0,$inout0 + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + vaesenclast $Ii,$inout1,$inout1 + vpaddb $T2,$T1,$Ii + mov %r13,0x70+8(%rsp) + lea 0x60($inp),$inp + # These two prefetches were added in BoringSSL. 
See change that added them. + prefetcht0 512($inp) # We use 96-byte block so prefetch 2 lines (128 bytes) + prefetcht0 576($inp) + vaesenclast $Z1,$inout2,$inout2 + vpaddb $T2,$Ii,$Z1 + mov %r12,0x78+8(%rsp) + lea 0x60($out),$out + vmovdqu 0x00-0x80($key),$rndkey + vaesenclast $Z2,$inout3,$inout3 + vpaddb $T2,$Z1,$Z2 + vaesenclast $Z3, $inout4,$inout4 + vpaddb $T2,$Z2,$Z3 + vaesenclast $Hkey,$inout5,$inout5 + vpaddb $T2,$Z3,$Hkey + + add \$0x60,%rax + sub \$0x6,$len + jc .L6x_done + + vmovups $inout0,-0x60($out) # save output + vpxor $rndkey,$T1,$inout0 + vmovups $inout1,-0x50($out) + vmovdqa $Ii,$inout1 # 0 latency + vmovups $inout2,-0x40($out) + vmovdqa $Z1,$inout2 # 0 latency + vmovups $inout3,-0x30($out) + vmovdqa $Z2,$inout3 # 0 latency + vmovups $inout4,-0x20($out) + vmovdqa $Z3,$inout4 # 0 latency + vmovups $inout5,-0x10($out) + vmovdqa $Hkey,$inout5 # 0 latency + vmovdqu 0x20+8(%rsp),$Z3 # I[5] + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled + vpxor $Z0,$Xi,$Xi # modulo-scheduled + + ret +.cfi_endproc +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +___ +###################################################################### +# +# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, +# const AES_KEY *key, unsigned char iv[16], const u128 Htbl[9], +# u128 *Xip); +$code.=<<___; +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@abi-omnipotent +.align 32 +aesni_gcm_decrypt: +.cfi_startproc +.seh_startproc + _CET_ENDBR + xor %rax,%rax + + # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60) + # bytes of input. + cmp \$0x60,$len # minimal accepted length + jb .Lgcm_dec_abort + + push %rbp +.cfi_push %rbp +.seh_pushreg %rbp + mov %rsp, %rbp # save stack pointer +.cfi_def_cfa_register %rbp + push %rbx +.cfi_push %rbx +.seh_pushreg %rbx + push %r12 +.cfi_push %r12 +.seh_pushreg %r12 + push %r13 +.cfi_push %r13 +.seh_pushreg %r13 + push %r14 +.cfi_push %r14 +.seh_pushreg %r14 + push %r15 +.cfi_push %r15 +.seh_pushreg %r15 +___ +if ($win64) { +$code.=<<___ + lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack +.seh_stackalloc 0xa8 +.seh_setframe %rbp, 0xa8+5*8 + # Load the last two parameters. These go into %rdi and %rsi, which are + # non-volatile on Windows, so stash them in the parameter stack area + # first. + mov %rdi, 0x10(%rbp) +.seh_savereg %rdi, 0xa8+5*8+0x10 + mov %rsi, 0x18(%rbp) +.seh_savereg %rsi, 0xa8+5*8+0x18 + mov 0x30(%rbp), $ivp + mov 0x38(%rbp), $Htable + # Save non-volatile XMM registers. 
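+  # %xmm6-%xmm15 are callee-saved under the Windows x64 calling convention,
+  # so they are spilled to the frame here and restored on exit; the matching
+  # .seh_savexmm directives record the slots for the unwinder.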
+ movaps %xmm6,-0xd0(%rbp) +.seh_savexmm %xmm6, 0xa8+5*8-0xd0 + movaps %xmm7,-0xc0(%rbp) +.seh_savexmm %xmm7, 0xa8+5*8-0xc0 + movaps %xmm8,-0xb0(%rbp) +.seh_savexmm %xmm8, 0xa8+5*8-0xb0 + movaps %xmm9,-0xa0(%rbp) +.seh_savexmm %xmm9, 0xa8+5*8-0xa0 + movaps %xmm10,-0x90(%rbp) +.seh_savexmm %xmm10, 0xa8+5*8-0x90 + movaps %xmm11,-0x80(%rbp) +.seh_savexmm %xmm11, 0xa8+5*8-0x80 + movaps %xmm12,-0x70(%rbp) +.seh_savexmm %xmm12, 0xa8+5*8-0x70 + movaps %xmm13,-0x60(%rbp) +.seh_savexmm %xmm13, 0xa8+5*8-0x60 + movaps %xmm14,-0x50(%rbp) +.seh_savexmm %xmm14, 0xa8+5*8-0x50 + movaps %xmm15,-0x40(%rbp) +.seh_savexmm %xmm15, 0xa8+5*8-0x40 +.seh_endprologue +___ +} +$code.=<<___; + vzeroupper + + mov $Xip_offset(%rbp), %r12 + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + vmovdqu (%r12),$Xi # load Xi + and \$-128,%rsp # ensure stack alignment + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + lea 0x80($key),$key # size optimization + lea 0x20($Htable),$Htable # size optimization + mov 0xf0-0x80($key),$rounds + vpshufb $Ii,$Xi,$Xi + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Ldec_no_key_aliasing + cmp \$768,$end0 + jnc .Ldec_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Ldec_no_key_aliasing: + + vmovdqu 0x50($inp),$Z3 # I[5] + mov $inp,$in0 + vmovdqu 0x40($inp),$Z0 + + # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) + # bytes before the end of the input. Note, in particular, that this is + # correct even if |$len| is not an even multiple of 96 or 16. XXX: This + # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must + # not be near the very beginning of the address space when |$len| < 2*96 + # (0xc0). 
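+  # In other words |$end0| = |$inp| + |$len| - 0xc0, so the main loop's
+  # "cmp $in0,$end0" keeps reading ahead only while at least 96 more bytes
+  # remain beyond the 96-byte block currently being processed.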
+ lea -0xc0($inp,$len),$end0 + + vmovdqu 0x30($inp),$Z1 + shr \$4,$len + xor %rax,%rax + vmovdqu 0x20($inp),$Z2 + vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu 0x10($inp),$T2 + vpshufb $Ii,$Z0,$Z0 + vmovdqu ($inp),$Hkey + vpshufb $Ii,$Z1,$Z1 + vmovdqu $Z0,0x30(%rsp) + vpshufb $Ii,$Z2,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$T2,$T2 + vmovdqu $Z2,0x50(%rsp) + vpshufb $Ii,$Hkey,$Hkey + vmovdqu $T2,0x60(%rsp) + vmovdqu $Hkey,0x70(%rsp) + + call _aesni_ctr32_ghash_6x + + mov $Xip_offset(%rbp), %r12 + vmovups $inout0,-0x60($out) # save output + vmovups $inout1,-0x50($out) + vmovups $inout2,-0x40($out) + vmovups $inout3,-0x30($out) + vmovups $inout4,-0x20($out) + vmovups $inout5,-0x10($out) + + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,(%r12) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd0(%rbp),%xmm6 + movaps -0xc0(%rbp),%xmm7 + movaps -0xb0(%rbp),%xmm8 + movaps -0xa0(%rbp),%xmm9 + movaps -0x90(%rbp),%xmm10 + movaps -0x80(%rbp),%xmm11 + movaps -0x70(%rbp),%xmm12 + movaps -0x60(%rbp),%xmm13 + movaps -0x50(%rbp),%xmm14 + movaps -0x40(%rbp),%xmm15 + mov 0x10(%rbp),%rdi + mov 0x18(%rbp),%rsi +___ +$code.=<<___; + lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation +.cfi_def_cfa %rsp, 0x38 + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp +.Lgcm_dec_abort: + ret +.seh_endproc +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ + +$code.=<<___; +.type _aesni_ctr32_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + lea -1($rounds),%r13 + vmovups 0x10-0x80($key),$rndkey + lea 0x20-0x80($key),%r12 + vpxor $Z0,$T1,$inout0 + add \$`6<<24`,$counter + jc .Lhandle_ctr32_2 + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddb $T2,$inout2,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddb $T2,$inout3,$inout4 + vpxor $Z0,$inout3,$inout3 + vpaddb $T2,$inout4,$inout5 + vpxor $Z0,$inout4,$inout4 + vpaddb $T2,$inout5,$T1 + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + vmovups (%r12),$rndkey + lea 0x10(%r12),%r12 + dec %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),$Hkey # last round key + vaesenc $rndkey,$inout0,$inout0 + vpxor 0x00($inp),$Hkey,$Z0 + vaesenc $rndkey,$inout1,$inout1 + vpxor 0x10($inp),$Hkey,$Z1 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x20($inp),$Hkey,$Z2 + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x30($inp),$Hkey,$Xi + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x40($inp),$Hkey,$T2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x50($inp),$Hkey,$Hkey + lea 0x60($inp),$inp + + vaesenclast $Z0,$inout0,$inout0 + vaesenclast $Z1,$inout1,$inout1 + vaesenclast $Z2,$inout2,$inout2 + vaesenclast $Xi,$inout3,$inout3 + vaesenclast $T2,$inout4,$inout4 + vaesenclast $Hkey,$inout5,$inout5 + vmovups $inout0,0x00($out) + vmovups $inout1,0x10($out) + vmovups $inout2,0x20($out) + vmovups $inout3,0x30($out) + vmovups $inout4,0x40($out) + vmovups $inout5,0x50($out) + lea 0x60($out),$out + + ret +.align 32 +.Lhandle_ctr32_2: + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd 
$Z1,$Z2,$inout2 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpxor $Z0,$inout3,$inout3 + vpshufb $Ii,$inout5,$inout5 + vpxor $Z0,$inout4,$inout4 + vpshufb $Ii,$T1,$T1 # next counter value + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@abi-omnipotent +.align 32 +aesni_gcm_encrypt: +.cfi_startproc +.seh_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit + movb \$1,BORINGSSL_function_hit+2(%rip) +#endif + xor %rax,%rax + + # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of + # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at + # least 96 more bytes of input. + cmp \$0x60*3,$len # minimal accepted length + jb .Lgcm_enc_abort + + push %rbp +.cfi_push %rbp +.seh_pushreg %rbp + mov %rsp, %rbp # save stack pointer +.cfi_def_cfa_register %rbp + push %rbx +.cfi_push %rbx +.seh_pushreg %rbx + push %r12 +.cfi_push %r12 +.seh_pushreg %r12 + push %r13 +.cfi_push %r13 +.seh_pushreg %r13 + push %r14 +.cfi_push %r14 +.seh_pushreg %r14 + push %r15 +.cfi_push %r15 +.seh_pushreg %r15 +___ +if ($win64) { +$code.=<<___ + lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack +.seh_stackalloc 0xa8 +.seh_setframe %rbp, 0xa8+5*8 + # Load the last two parameters. These go into %rdi and %rsi, which are + # non-volatile on Windows, so stash them in the parameter stack area + # first. + mov %rdi, 0x10(%rbp) +.seh_savereg %rdi, 0xa8+5*8+0x10 + mov %rsi, 0x18(%rbp) +.seh_savereg %rsi, 0xa8+5*8+0x18 + mov 0x30(%rbp), $ivp + mov 0x38(%rbp), $Htable + # Save non-volatile XMM registers. + movaps %xmm6,-0xd0(%rbp) +.seh_savexmm %xmm6, 0xa8+5*8-0xd0 + movaps %xmm7,-0xc0(%rbp) +.seh_savexmm %xmm7, 0xa8+5*8-0xc0 + movaps %xmm8,-0xb0(%rbp) +.seh_savexmm %xmm8, 0xa8+5*8-0xb0 + movaps %xmm9,-0xa0(%rbp) +.seh_savexmm %xmm9, 0xa8+5*8-0xa0 + movaps %xmm10,-0x90(%rbp) +.seh_savexmm %xmm10, 0xa8+5*8-0x90 + movaps %xmm11,-0x80(%rbp) +.seh_savexmm %xmm11, 0xa8+5*8-0x80 + movaps %xmm12,-0x70(%rbp) +.seh_savexmm %xmm12, 0xa8+5*8-0x70 + movaps %xmm13,-0x60(%rbp) +.seh_savexmm %xmm13, 0xa8+5*8-0x60 + movaps %xmm14,-0x50(%rbp) +.seh_savexmm %xmm14, 0xa8+5*8-0x50 + movaps %xmm15,-0x40(%rbp) +.seh_savexmm %xmm15, 0xa8+5*8-0x40 +.seh_endprologue +___ +} +$code.=<<___; + vzeroupper + + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + lea 0x80($key),$key # size optimization + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + and \$-128,%rsp # ensure stack alignment + mov 0xf0-0x80($key),$rounds + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Lenc_no_key_aliasing + cmp \$768,$end0 + jnc .Lenc_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Lenc_no_key_aliasing: + + mov $out,$in0 + + # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) + # bytes before the end of the input. Note, in particular, that this is + # correct even if |$len| is not an even multiple of 96 or 16. 
Unlike in + # the decryption case, there's no caveat that |$out| must not be near + # the very beginning of the address space, because we know that + # |$len| >= 3*96 from the check above, and so we know + # |$out| + |$len| >= 2*96 (0xc0). + lea -0xc0($out,$len),$end0 + + shr \$4,$len + + call _aesni_ctr32_6x + vpshufb $Ii,$inout0,$Xi # save bswapped output on stack + vpshufb $Ii,$inout1,$T2 + vmovdqu $Xi,0x70(%rsp) + vpshufb $Ii,$inout2,$Z0 + vmovdqu $T2,0x60(%rsp) + vpshufb $Ii,$inout3,$Z1 + vmovdqu $Z0,0x50(%rsp) + vpshufb $Ii,$inout4,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu $Z2,0x30(%rsp) + + call _aesni_ctr32_6x + + mov $Xip_offset(%rbp), %r12 + lea 0x20($Htable),$Htable # size optimization + vmovdqu (%r12),$Xi # load Xi + sub \$12,$len + mov \$0x60*2,%rax + vpshufb $Ii,$Xi,$Xi + + call _aesni_ctr32_ghash_6x + vmovdqu 0x20(%rsp),$Z3 # I[5] + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 + vpunpckhqdq $Z3,$Z3,$T1 + vmovdqu 0x20-0x20($Htable),$rndkey # borrow $rndkey for $HK + vmovups $inout0,-0x60($out) # save output + vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy + vpxor $Z3,$T1,$T1 + vmovups $inout1,-0x50($out) + vpshufb $Ii,$inout1,$inout1 + vmovups $inout2,-0x40($out) + vpshufb $Ii,$inout2,$inout2 + vmovups $inout3,-0x30($out) + vpshufb $Ii,$inout3,$inout3 + vmovups $inout4,-0x20($out) + vpshufb $Ii,$inout4,$inout4 + vmovups $inout5,-0x10($out) + vpshufb $Ii,$inout5,$inout5 + vmovdqu $inout0,0x10(%rsp) # free $inout0 +___ +{ my ($HK,$T3)=($rndkey,$inout0); + +$code.=<<___; + vmovdqu 0x30(%rsp),$Z2 # I[4] + vmovdqu 0x10-0x20($Htable),$Ii # borrow $Ii for $Hkey^2 + vpunpckhqdq $Z2,$Z2,$T2 + vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 + vpxor $Z2,$T2,$T2 + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + + vmovdqu 0x40(%rsp),$T3 # I[3] + vpclmulqdq \$0x00,$Ii,$Z2,$Z0 + vmovdqu 0x30-0x20($Htable),$Hkey # $Hkey^3 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $T3,$T3,$Z1 + vpclmulqdq \$0x11,$Ii,$Z2,$Z2 + vpxor $T3,$Z1,$Z1 + vpxor $Z3,$Z2,$Z2 + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Htable),$HK + vpxor $T1,$T2,$T2 + + vmovdqu 0x50(%rsp),$T1 # I[2] + vpclmulqdq \$0x00,$Hkey,$T3,$Z3 + vmovdqu 0x40-0x20($Htable),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z0,$Z3,$Z3 + vpunpckhqdq $T1,$T1,$Z0 + vpclmulqdq \$0x11,$Hkey,$T3,$T3 + vpxor $T1,$Z0,$Z0 + vpxor $Z2,$T3,$T3 + vpclmulqdq \$0x00,$HK,$Z1,$Z1 + vpxor $T2,$Z1,$Z1 + + vmovdqu 0x60(%rsp),$T2 # I[1] + vpclmulqdq \$0x00,$Ii,$T1,$Z2 + vmovdqu 0x60-0x20($Htable),$Hkey # $Hkey^5 + vpxor $Z3,$Z2,$Z2 + vpunpckhqdq $T2,$T2,$Z3 + vpclmulqdq \$0x11,$Ii,$T1,$T1 + vpxor $T2,$Z3,$Z3 + vpxor $T3,$T1,$T1 + vpclmulqdq \$0x10,$HK,$Z0,$Z0 + vmovdqu 0x80-0x20($Htable),$HK + vpxor $Z1,$Z0,$Z0 + + vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] + vpclmulqdq \$0x00,$Hkey,$T2,$Z1 + vmovdqu 0x70-0x20($Htable),$Ii # borrow $Ii for $Hkey^6 + vpunpckhqdq $Xi,$Xi,$T3 + vpxor $Z2,$Z1,$Z1 + vpclmulqdq \$0x11,$Hkey,$T2,$T2 + vpxor $Xi,$T3,$T3 + vpxor $T1,$T2,$T2 + vpclmulqdq \$0x00,$HK,$Z3,$Z3 + vpxor $Z0,$Z3,$Z0 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z2 + vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 + vpunpckhqdq $inout5,$inout5,$T1 + vpclmulqdq \$0x11,$Ii,$Xi,$Xi + vpxor $inout5,$T1,$T1 + vpxor $Z1,$Z2,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$T3 + vmovdqu 0x20-0x20($Htable),$HK + vpxor $T2,$Xi,$Z3 + vpxor $Z0,$T3,$Z2 + + vmovdqu 0x10-0x20($Htable),$Ii # borrow $Ii for $Hkey^2 + vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 + vpxor 
$T3,$Z2,$Z2 + vpunpckhqdq $inout4,$inout4,$T2 + vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 + vpxor $inout4,$T2,$T2 + vpslldq \$8,$Z2,$T3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + vpxor $T3,$Z1,$Xi + vpsrldq \$8,$Z2,$Z2 + vpxor $Z2,$Z3,$Z3 + + vpclmulqdq \$0x00,$Ii,$inout4,$Z1 + vmovdqu 0x30-0x20($Htable),$Hkey # $Hkey^3 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout3,$inout3,$T3 + vpclmulqdq \$0x11,$Ii,$inout4,$inout4 + vpxor $inout3,$T3,$T3 + vpxor $inout5,$inout4,$inout4 + vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Htable),$HK + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 + vmovdqu 0x40-0x20($Htable),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $inout2,$inout2,$T1 + vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 + vpxor $inout2,$T1,$T1 + vpxor $inout4,$inout3,$inout3 + vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 + vpclmulqdq \$0x00,$HK,$T3,$T3 + vpxor $T2,$T3,$T3 + + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Ii,$inout2,$Z1 + vmovdqu 0x60-0x20($Htable),$Hkey # $Hkey^5 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout1,$inout1,$T2 + vpclmulqdq \$0x11,$Ii,$inout2,$inout2 + vpxor $inout1,$T2,$T2 + vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase + vpxor $inout3,$inout2,$inout2 + vpclmulqdq \$0x10,$HK,$T1,$T1 + vmovdqu 0x80-0x20($Htable),$HK + vpxor $T3,$T1,$T1 + + vxorps $Z3,$inout5,$inout5 + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 + vmovdqu 0x70-0x20($Htable),$Ii # borrow $Ii for $Hkey^6 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $Xi,$Xi,$T3 + vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 + vpxor $Xi,$T3,$T3 + vpxor $inout2,$inout1,$inout1 + vpclmulqdq \$0x00,$HK,$T2,$T2 + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z1 + vpclmulqdq \$0x11,$Ii,$Xi,$Z3 + vpxor $Z0,$Z1,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$Z2 + vpxor $inout1,$Z3,$Z3 + vpxor $T2,$Z2,$Z2 + + vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing + vpxor $Z0,$Z2,$Z2 + vpslldq \$8,$Z2,$T1 + vmovdqu 0x10($const),$Hkey # .Lpoly + vpsrldq \$8,$Z2,$Z2 + vpxor $T1,$Z1,$Xi + vpxor $Z2,$Z3,$Z3 + + vpalignr \$8,$Xi,$Xi,$T2 # 1st phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $Z3,$T2,$T2 + vpxor $T2,$Xi,$Xi +___ +} +$code.=<<___; + mov $Xip_offset(%rbp), %r12 + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,(%r12) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd0(%rbp),%xmm6 + movaps -0xc0(%rbp),%xmm7 + movaps -0xb0(%rbp),%xmm8 + movaps -0xa0(%rbp),%xmm9 + movaps -0x90(%rbp),%xmm10 + movaps -0x80(%rbp),%xmm11 + movaps -0x70(%rbp),%xmm12 + movaps -0x60(%rbp),%xmm13 + movaps -0x50(%rbp),%xmm14 + movaps -0x40(%rbp),%xmm15 + mov 0x10(%rbp),%rdi + mov 0x18(%rbp),%rsi +___ +$code.=<<___; + lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation +.cfi_def_cfa %rsp, 0x38 + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbx +.cfi_pop %rbx + pop %rbp +.cfi_pop %rbp +.Lgcm_enc_abort: + ret +.seh_endproc +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +___ + +$code.=<<___; +.section .rodata +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: + .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS 
by " +.align 64 +.text +___ +}}} else {{{ +$code=<<___; # assembler is too old +.text + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@abi-omnipotent +aesni_gcm_encrypt: + _CET_ENDBR + xor %eax,%eax + ret +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@abi-omnipotent +aesni_gcm_decrypt: + _CET_ENDBR + xor %eax,%eax + ret +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ +}}} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86.pl new file mode 100644 index 0000000000..8a7a15619d --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86.pl @@ -0,0 +1,973 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for +# details]. +# +# Performance. +# +# To start with see corresponding paragraph in aesni-x86_64.pl... +# Instead of filling table similar to one found there I've chosen to +# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. +# The simplified table below represents 32-bit performance relative +# to 64-bit one in every given point. Ratios vary for different +# encryption modes, therefore interval values. +# +# 16-byte 64-byte 256-byte 1-KB 8-KB +# 53-67% 67-84% 91-94% 95-98% 97-99.5% +# +# Lower ratios for smaller block sizes are perfectly understandable, +# because function call overhead is higher in 32-bit mode. Largest +# 8-KB block performance is virtually same: 32-bit code is less than +# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. + +# January 2011 +# +# See aesni-x86_64.pl for details. Unlike x86_64 version this module +# interleaves at most 6 aes[enc|dec] instructions, because there are +# not enough registers for 8x interleave [which should be optimal for +# Sandy Bridge]. Actually, performance results for 6x interleave +# factor presented in aesni-x86_64.pl (except for CTR) are for this +# module. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. + +# November 2015 +# +# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL] + +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). 
+# +# CBC en-/decrypt CTR XTS ECB OCB +# Westmere 3.77/1.37 1.37 1.52 1.27 +# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10 +# Haswell 4.44/0.80 0.97 1.03 0.72 0.76 +# Skylake 2.68/0.65 0.65 0.66 0.64 0.66 +# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03 +# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70 +# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23 + +$PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-586.pl:-) +$AESNI_PREFIX="aes_hw"; +$inline=1; # inline _aesni_[en|de]crypt + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../../perlasm"); +require "x86asm.pl"; + +$output = pop; +open OUT,">$output"; +*STDOUT=*OUT; + +&asm_init($ARGV[0]); + +&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST") +&external_label("BORINGSSL_function_hit"); +&preprocessor_endif(); +&static_label("key_const"); + +if ($PREFIX eq $AESNI_PREFIX) { $movekey=\&movups; } +else { $movekey=\&movups; } + +$len="eax"; +$rounds="ecx"; +$key="edx"; +$inp="esi"; +$out="edi"; +$rounds_="ebx"; # backup copy for $rounds +$key_="ebp"; # backup copy for $key + +$rndkey0="xmm0"; +$rndkey1="xmm1"; +$inout0="xmm2"; +$inout1="xmm3"; +$inout2="xmm4"; +$inout3="xmm5"; $in1="xmm5"; +$inout4="xmm6"; $in0="xmm6"; +$inout5="xmm7"; $ivec="xmm7"; + +# AESNI extension +sub aeskeygenassist +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } +} +sub aescommon +{ my($opcodelet,$dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} +} +sub aesimc { aescommon(0xdb,@_); } +sub aesenc { aescommon(0xdc,@_); } +sub aesenclast { aescommon(0xdd,@_); } + +# Inline version of internal aesni_[en|de]crypt1 +{ my $sn; +sub aesni_inline_generate1 +{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); + $sn++; + + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($ivec,$rndkey0) if (defined($ivec)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout,$ivec) if (defined($ivec)); + &xorps ($inout,$rndkey0) if (!defined($ivec)); + &set_label("${p}1_loop_$sn"); + eval"&aes${p} ($inout,$rndkey1)"; + &dec ($rounds); + &$movekey ($rndkey1,&QWP(0,$key)); + &lea ($key,&DWP(16,$key)); + &jnz (&label("${p}1_loop_$sn")); + eval"&aes${p}last ($inout,$rndkey1)"; +}} + +sub aesni_generate1 # fully unrolled loop +{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); + + &function_begin_B("_aesni_${p}rypt1"); + &movups ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(0x10,$key)); + &xorps ($inout,$rndkey0); + &$movekey ($rndkey0,&QWP(0x20,$key)); + &lea ($key,&DWP(0x30,$key)); + &cmp ($rounds,11); + &jb (&label("${p}128")); + &lea ($key,&DWP(0x40,$key)); + # 192-bit key support was removed. + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x40,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x30,$key)); + # 192-bit key support was removed. 
+ eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x20,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x10,$key)); + &set_label("${p}128"); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x10,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x20,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x30,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x40,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x50,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x60,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x70,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + eval"&aes${p}last ($inout,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt1"); +} + + +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal +# utilization, i.e. when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on Atom Silvermont account. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals to corresponding instructions latency. 8x is optimal for +# * Bridge, but it's unfeasible to accommodate such implementation +# in XMM registers addressable in 32-bit mode and therefore maximum +# of 6x is used instead... 
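+# (Rule of thumb: with one aes[enc|dec] issued per cycle, an interleave
+# factor equal to the instruction latency keeps the unit fully busy, since
+# each block's next round becomes ready just as its turn comes around again.)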
+ +sub aesni_generate2 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt2"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); + + &set_label("${p}2_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}2_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt2"); +} + +sub aesni_generate3 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt3"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); + + &set_label("${p}3_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}3_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt3"); +} + +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... 
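+# (As with the 2x and 3x variants, the subroutine below simply runs four
+# independent blocks through every round key in lockstep.)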
+sub aesni_generate4 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt4"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &shl ($rounds,4); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &data_byte (0x0f,0x1f,0x40,0x00); + &add ($rounds,16); + + &set_label("${p}4_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}4_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt4"); +} + +sub aesni_generate6 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt6"); + &static_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); # pxor does better here + &pxor ($inout2,$rndkey0); + eval"&aes${p} ($inout0,$rndkey1)"; + &pxor ($inout3,$rndkey0); + &pxor ($inout4,$rndkey0); + eval"&aes${p} ($inout1,$rndkey1)"; + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &pxor ($inout5,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key,$rounds)); + &add ($rounds,16); + &jmp (&label("_aesni_${p}rypt6_inner")); + + &set_label("${p}6_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + &set_label("_aesni_${p}rypt6_inner"); + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + &set_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + eval"&aes${p} ($inout4,$rndkey0)"; + eval"&aes${p} ($inout5,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}6_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + eval"&aes${p}last ($inout4,$rndkey0)"; + eval"&aes${p}last ($inout5,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt6"); +} +&aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX); +&aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX); +&aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX); +&aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX); + +if ($PREFIX eq $AESNI_PREFIX) { + +###################################################################### +# void aes_hw_ctr32_encrypt_blocks 
(const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +# +# Handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see crypto/modes/ctr128.c for details) +# +# stack layout: +# 0 pshufb mask +# 16 vector addend: 0,6,6,6 +# 32 counter-less ivec +# 48 1st triplet of counter vector +# 64 2nd triplet of counter vector +# 80 saved %esp + +&function_begin("${PREFIX}_ctr32_encrypt_blocks"); + &record_function_hit(0); + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($key_,"esp"); + &sub ("esp",88); + &and ("esp",-16); # align stack + &mov (&DWP(80,"esp"),$key_); + + &cmp ($len,1); + &je (&label("ctr32_one_shortcut")); + + &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds,6); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(20,"esp"),$rounds); + &mov (&DWP(24,"esp"),$rounds); + &mov (&DWP(28,"esp"),$key_); + + &pextrd ($rounds_,$inout5,3); # pull 32-bit counter + &pinsrd ($inout5,$key_,3); # wipe 32-bit counter + + &mov ($rounds,&DWP(240,$key)); # key->rounds + + # compose 2 vectors of 3x32-bit counters + &bswap ($rounds_); + &pxor ($rndkey0,$rndkey0); + &pxor ($rndkey1,$rndkey1); + &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask + &pinsrd ($rndkey0,$rounds_,0); + &lea ($key_,&DWP(3,$rounds_)); + &pinsrd ($rndkey1,$key_,0); + &inc ($rounds_); + &pinsrd ($rndkey0,$rounds_,1); + &inc ($key_); + &pinsrd ($rndkey1,$key_,1); + &inc ($rounds_); + &pinsrd ($rndkey0,$rounds_,2); + &inc ($key_); + &pinsrd ($rndkey1,$key_,2); + &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet + &pshufb ($rndkey0,$inout0); # byte swap + &movdqu ($inout4,&QWP(0,$key)); # key[0] + &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet + &pshufb ($rndkey1,$inout0); # byte swap + + &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword + &pshufd ($inout1,$rndkey0,2<<6); + &cmp ($len,6); + &jb (&label("ctr32_tail")); + &pxor ($inout5,$inout4); # counter-less ivec^key[0] + &shl ($rounds,4); + &mov ($rounds_,16); + &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] + &mov ($key_,$key); # backup $key + &sub ($rounds_,$rounds); # backup twisted $rounds + &lea ($key,&DWP(32,$key,$rounds)); + &sub ($len,6); + &jmp (&label("ctr32_loop6")); + +&set_label("ctr32_loop6",16); + # inlining _aesni_encrypt6's prologue gives ~6% improvement... 
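+  # The pshufd/pxor sequence below rebuilds the counter blocks for this
+  # iteration from the two byte-swapped counter triplets, merging in the
+  # counter-less ivec^key[0] saved at 32(%esp), which also performs the
+  # round-0 AddRoundKey.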
+ &pshufd ($inout2,$rndkey0,1<<6); + &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec + &pshufd ($inout3,$rndkey1,3<<6); + &pxor ($inout0,$rndkey0); # merge counter-less ivec + &pshufd ($inout4,$rndkey1,2<<6); + &pxor ($inout1,$rndkey0); + &pshufd ($inout5,$rndkey1,1<<6); + &$movekey ($rndkey1,&QWP(16,$key_)); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &aesenc ($inout0,$rndkey1); + &pxor ($inout4,$rndkey0); + &pxor ($inout5,$rndkey0); + &aesenc ($inout1,$rndkey1); + &$movekey ($rndkey0,&QWP(32,$key_)); + &mov ($rounds,$rounds_); + &aesenc ($inout2,$rndkey1); + &aesenc ($inout3,$rndkey1); + &aesenc ($inout4,$rndkey1); + &aesenc ($inout5,$rndkey1); + + &call (&label("_aesni_encrypt6_enter")); + + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &movdqa ($rndkey0,&QWP(16,"esp")); # load increment + &xorps ($inout2,$rndkey1); + &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + + &paddd ($rndkey1,$rndkey0); # 2nd triplet increment + &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment + &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask + + &movups ($inout1,&QWP(0x30,$inp)); + &movups ($inout2,&QWP(0x40,$inp)); + &xorps ($inout3,$inout1); + &movups ($inout1,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet + &pshufb ($rndkey0,$inout0); # byte swap + &xorps ($inout4,$inout2); + &movups (&QWP(0x30,$out),$inout3); + &xorps ($inout5,$inout1); + &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet + &pshufb ($rndkey1,$inout0); # byte swap + &movups (&QWP(0x40,$out),$inout4); + &pshufd ($inout0,$rndkey0,3<<6); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + + &pshufd ($inout1,$rndkey0,2<<6); + &sub ($len,6); + &jnc (&label("ctr32_loop6")); + + &add ($len,6); + &jz (&label("ctr32_ret")); + &movdqu ($inout5,&QWP(0,$key_)); + &mov ($key,$key_); + &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec + &mov ($rounds,&DWP(240,$key_)); # restore $rounds + +&set_label("ctr32_tail"); + &por ($inout0,$inout5); + &cmp ($len,2); + &jb (&label("ctr32_one")); + + &pshufd ($inout2,$rndkey0,1<<6); + &por ($inout1,$inout5); + &je (&label("ctr32_two")); + + &pshufd ($inout3,$rndkey1,3<<6); + &por ($inout2,$inout5); + &cmp ($len,4); + &jb (&label("ctr32_three")); + + &pshufd ($inout4,$rndkey1,2<<6); + &por ($inout3,$inout5); + &je (&label("ctr32_four")); + + &por ($inout4,$inout5); + &call ("_aesni_encrypt6"); + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout2,$rndkey1); + &movups ($rndkey1,&QWP(0x40,$inp)); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout4,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_one_shortcut",16); + &movups ($inout0,&QWP(0,$rounds_)); # load ivec + &mov ($rounds,&DWP(240,$key)); + +&set_label("ctr32_one"); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups ($in0,&QWP(0,$inp)); + &xorps ($in0,$inout0); + &movups (&QWP(0,$out),$in0); + 
&jmp (&label("ctr32_ret")); + +&set_label("ctr32_two",16); + &call ("_aesni_encrypt2"); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_three",16); + &call ("_aesni_encrypt3"); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &movups ($inout5,&QWP(0x20,$inp)); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$inout5); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_four",16); + &call ("_aesni_encrypt4"); + &movups ($inout4,&QWP(0,$inp)); + &movups ($inout5,&QWP(0x10,$inp)); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout0,$inout4); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout1,$inout5); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + +&set_label("ctr32_ret"); + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack + &pxor ("xmm5","xmm5"); + &movdqa (&QWP(48,"esp"),"xmm0"); + &pxor ("xmm6","xmm6"); + &movdqa (&QWP(64,"esp"),"xmm0"); + &pxor ("xmm7","xmm7"); + &mov ("esp",&DWP(80,"esp")); +&function_end("${PREFIX}_ctr32_encrypt_blocks"); +} + +###################################################################### +# Mechanical port from aesni-x86_64.pl. + +# int $PREFIX_set_encrypt_key_base (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_encrypt_key_base"); + &record_function_hit(3); + + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &push ("ebx"); + + &call (&label("pic")); +&set_label("pic"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); + + &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey + &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &lea ($key,&DWP(16,$key)); + &cmp ($rounds,256); + &je (&label("14rounds")); + # 192-bit key support was removed. 
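+	# Only 128- and 256-bit keys are accepted: any other bit count in
+	# $rounds is rejected via bad_keybits below and returns -2.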
+ &cmp ($rounds,128); + &jne (&label("bad_keybits")); + +&set_label("10rounds",16); + &mov ($rounds,9); + &$movekey (&QWP(-16,$key),"xmm0"); # round 0 + &aeskeygenassist("xmm1","xmm0",0x01); # round 1 + &call (&label("key_128_cold")); + &aeskeygenassist("xmm1","xmm0",0x2); # round 2 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 3 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 4 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 5 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 6 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x40); # round 7 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x80); # round 8 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x36); # round 10 + &call (&label("key_128")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(80,$key),$rounds); + + &jmp (&label("good_key")); + +&set_label("key_128",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_128_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); + &ret(); + +&set_label("14rounds",16); + &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey + &lea ($key,&DWP(16,$key)); + + &mov ($rounds,13); + &$movekey (&QWP(-32,$key),"xmm0"); # round 0 + &$movekey (&QWP(-16,$key),"xmm2"); # round 1 + &aeskeygenassist("xmm1","xmm2",0x01); # round 2 + &call (&label("key_256a_cold")); + &aeskeygenassist("xmm1","xmm0",0x01); # round 3 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 4 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x02); # round 5 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 6 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 7 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 8 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 9 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 10 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 11 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 12 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 13 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 14 + &call (&label("key_256a")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(16,$key),$rounds); + &xor ("eax","eax"); + + &jmp (&label("good_key")); + +&set_label("key_256a",16); + &$movekey (&QWP(0,$key),"xmm2"); + &lea ($key,&DWP(16,$key)); +&set_label("key_256a_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); + &ret(); + +&set_label("key_256b",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); + + &shufps ("xmm4","xmm2",0b00010000); + &xorps ("xmm2","xmm4"); + &shufps ("xmm4","xmm2",0b10001100); + &xorps ("xmm2","xmm4"); + &shufps ("xmm1","xmm1",0b10101010); # critical path + &xorps ("xmm2","xmm1"); + &ret(); + +&set_label("good_key"); + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + 
&pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &xor ("eax","eax"); + &pop ("ebx"); + &ret (); + +&set_label("bad_keybits",4); + &pxor ("xmm0","xmm0"); + &mov ("eax",-2); + &pop ("ebx"); + &ret (); +&function_end_B("${PREFIX}_set_encrypt_key_base"); + +# int $PREFIX_set_encrypt_key_alt (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_encrypt_key_alt"); + &record_function_hit(3); + + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &push ("ebx"); + + &call (&label("pic")); +&set_label("pic"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); + + &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey + &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &lea ($key,&DWP(16,$key)); + &cmp ($rounds,256); + &je (&label("14rounds_alt")); + # 192-bit key support was removed. + &cmp ($rounds,128); + &jne (&label("bad_keybits")); + +&set_label("10rounds_alt",16); + &movdqa ("xmm5",&QWP(0x00,"ebx")); + &mov ($rounds,8); + &movdqa ("xmm4",&QWP(0x20,"ebx")); + &movdqa ("xmm2","xmm0"); + &movdqu (&QWP(-16,$key),"xmm0"); + +&set_label("loop_key128"); + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + &pslld ("xmm4",1); + &lea ($key,&DWP(16,$key)); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(-16,$key),"xmm0"); + &movdqa ("xmm2","xmm0"); + + &dec ($rounds); + &jnz (&label("loop_key128")); + + &movdqa ("xmm4",&QWP(0x30,"ebx")); + + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + &pslld ("xmm4",1); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(0,$key),"xmm0"); + + &movdqa ("xmm2","xmm0"); + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(16,$key),"xmm0"); + + &mov ($rounds,9); + &mov (&DWP(96,$key),$rounds); + + &jmp (&label("good_key")); + + # 192-bit key support was removed. 
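+	# The 256-bit path below applies the same pshufb/aesenclast recurrence
+	# alternately to the two halves of the user key (xmm0 and xmm2); seven
+	# passes of loop_key256 extend the two raw halves into the full
+	# 15-entry schedule.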
+ +&set_label("14rounds_alt",16); + &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey + &lea ($key,&DWP(16,$key)); + &movdqa ("xmm5",&QWP(0x00,"ebx")); + &movdqa ("xmm4",&QWP(0x20,"ebx")); + &mov ($rounds,7); + &movdqu (&QWP(-32,$key),"xmm0"); + &movdqa ("xmm1","xmm2"); + &movdqu (&QWP(-16,$key),"xmm2"); + +&set_label("loop_key256"); + &pshufb ("xmm2","xmm5"); + &aesenclast ("xmm2","xmm4"); + + &movdqa ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm0","xmm3"); + &pslld ("xmm4",1); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(0,$key),"xmm0"); + + &dec ($rounds); + &jz (&label("done_key256")); + + &pshufd ("xmm2","xmm0",0xff); + &pxor ("xmm3","xmm3"); + &aesenclast ("xmm2","xmm3"); + + &movdqa ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm1","xmm3"); + + &pxor ("xmm2","xmm1"); + &movdqu (&QWP(16,$key),"xmm2"); + &lea ($key,&DWP(32,$key)); + &movdqa ("xmm1","xmm2"); + &jmp (&label("loop_key256")); + +&set_label("done_key256"); + &mov ($rounds,13); + &mov (&DWP(16,$key),$rounds); + +&set_label("good_key"); + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &xor ("eax","eax"); + &pop ("ebx"); + &ret (); + +&set_label("bad_keybits",4); + &pxor ("xmm0","xmm0"); + &mov ("eax",-2); + &pop ("ebx"); + &ret (); +&function_end_B("${PREFIX}_set_encrypt_key_alt"); + + +&set_label("key_const",64); +&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); +&data_word(0x04070605,0x04070605,0x04070605,0x04070605); +&data_word(1,1,1,1); +&data_word(0x1b,0x1b,0x1b,0x1b); +&asciz("AES for Intel AES-NI, CRYPTOGAMS by "); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86_64.pl new file mode 100644 index 0000000000..94fd3a95ab --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesni-x86_64.pl @@ -0,0 +1,1600 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for +# details]. +# +# Performance. +# +# Given aes(enc|dec) instructions' latency asymptotic performance for +# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte +# processed with 128-bit key. And given their throughput asymptotic +# performance for parallelizable modes is 1.25 cycles per byte. 
Being +# asymptotic limit it's not something you commonly achieve in reality, +# but how close does one get? Below are results collected for +# different modes and block sized. Pairs of numbers are for en-/ +# decryption. +# +# 16-byte 64-byte 256-byte 1-KB 8-KB +# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 +# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 +# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 +# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 +# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 +# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 +# +# ECB, CTR, CBC and CCM results are free from EVP overhead. This means +# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni +# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. +# The results were collected with specially crafted speed.c benchmark +# in order to compare them with results reported in "Intel Advanced +# Encryption Standard (AES) New Instruction Set" White Paper Revision +# 3.0 dated May 2010. All above results are consistently better. This +# module also provides better performance for block sizes smaller than +# 128 bytes in points *not* represented in the above table. +# +# Looking at the results for 8-KB buffer. +# +# CFB and OFB results are far from the limit, because implementation +# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on +# single-block aesni_encrypt, which is not the most optimal way to go. +# CBC encrypt result is unexpectedly high and there is no documented +# explanation for it. Seemingly there is a small penalty for feeding +# the result back to AES unit the way it's done in CBC mode. There is +# nothing one can do and the result appears optimal. CCM result is +# identical to CBC, because CBC-MAC is essentially CBC encrypt without +# saving output. CCM CTR "stays invisible," because it's neatly +# interleaved wih CBC-MAC. This provides ~30% improvement over +# "straightforward" CCM implementation with CTR and CBC-MAC performed +# disjointly. Parallelizable modes practically achieve the theoretical +# limit. +# +# Looking at how results vary with buffer size. +# +# Curves are practically saturated at 1-KB buffer size. In most cases +# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. +# CTR curve doesn't follow this pattern and is "slowest" changing one +# with "256-byte" result being 87% of "8-KB." This is because overhead +# in CTR mode is most computationally intensive. Small-block CCM +# decrypt is slower than encrypt, because first CTR and last CBC-MAC +# iterations can't be interleaved. +# +# Results for 192- and 256-bit keys. +# +# EVP-free results were observed to scale perfectly with number of +# rounds for larger block sizes, i.e. 192-bit result being 10/12 times +# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences +# are a tad smaller, because the above mentioned penalty biases all +# results by same constant value. In similar way function call +# overhead affects small-block performance, as well as OFB and CFB +# results. Differences are not large, most common coefficients are +# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one +# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... + +# January 2011 +# +# While Westmere processor features 6 cycles latency for aes[enc|dec] +# instructions, which can be scheduled every second cycle, Sandy +# Bridge spends 8 cycles per instruction, but it can schedule them +# every cycle. 
This means that code targeting Westmere would perform +# suboptimally on Sandy Bridge. Therefore this update. +# +# In addition, non-parallelizable CBC encrypt (as well as CCM) is +# optimized. Relative improvement might appear modest, 8% on Westmere, +# but in absolute terms it's 3.77 cycles per byte encrypted with +# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers +# should be compared to asymptotic limits of 3.75 for Westmere and +# 5.00 for Sandy Bridge. Actually, the fact that they get this close +# to asymptotic limits is quite amazing. Indeed, the limit is +# calculated as latency times number of rounds, 10 for 128-bit key, +# and divided by 16, the number of bytes in block, or in other words +# it accounts *solely* for aesenc instructions. But there are extra +# instructions, and numbers so close to the asymptotic limits mean +# that it's as if it takes as little as *one* additional cycle to +# execute all of them. How is it possible? It is possible thanks to +# out-of-order execution logic, which manages to overlap post- +# processing of previous block, things like saving the output, with +# actual encryption of current block, as well as pre-processing of +# current block, things like fetching input and xor-ing it with +# 0-round element of the key schedule, with actual encryption of +# previous block. Keep this in mind... +# +# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher +# performance is achieved by interleaving instructions working on +# independent blocks. In which case asymptotic limit for such modes +# can be obtained by dividing above mentioned numbers by AES +# instructions' interleave factor. Westmere can execute at most 3 +# instructions at a time, meaning that optimal interleave factor is 3, +# and that's where the "magic" number of 1.25 come from. "Optimal +# interleave factor" means that increase of interleave factor does +# not improve performance. The formula has proven to reflect reality +# pretty well on Westmere... Sandy Bridge on the other hand can +# execute up to 8 AES instructions at a time, so how does varying +# interleave factor affect the performance? Here is table for ECB +# (numbers are cycles per byte processed with 128-bit key): +# +# instruction interleave factor 3x 6x 8x +# theoretical asymptotic limit 1.67 0.83 0.625 +# measured performance for 8KB block 1.05 0.86 0.84 +# +# "as if" interleave factor 4.7x 5.8x 6.0x +# +# Further data for other parallelizable modes: +# +# CBC decrypt 1.16 0.93 0.74 +# CTR 1.14 0.91 0.74 +# +# Well, given 3x column it's probably inappropriate to call the limit +# asymptotic, if it can be surpassed, isn't it? What happens there? +# Rewind to CBC paragraph for the answer. Yes, out-of-order execution +# magic is responsible for this. Processor overlaps not only the +# additional instructions with AES ones, but even AES instructions +# processing adjacent triplets of independent blocks. In the 6x case +# additional instructions still claim disproportionally small amount +# of additional cycles, but in 8x case number of instructions must be +# a tad too high for out-of-order logic to cope with, and AES unit +# remains underutilized... As you can see 8x interleave is hardly +# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl +# utilizes 6x interleave because of limited register bank capacity. +# +# Higher interleave factors do have negative impact on Westmere +# performance. 
While for ECB mode it's negligible ~1.5%, other +# parallelizables perform ~5% worse, which is outweighed by ~25% +# improvement on Sandy Bridge. To balance regression on Westmere +# CTR mode was implemented with 6x aesenc interleave factor. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like +# in CTR mode AES instruction interleave factor was chosen to be 6x. + +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS ECB OCB +# Westmere 3.77/1.25 1.25 1.25 1.26 +# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 +# Haswell 4.44/0.63 0.63 0.73 0.63 0.70 +# Skylake 2.62/0.63 0.63 0.63 0.63 +# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 +# Knights L 2.54/0.77 0.78 0.85 - 1.50 +# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 +# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 +# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49 +# +# (*) Atom Silvermont ECB result is suboptimal because of penalties +# incurred by operations on %xmm8-15. As ECB is not considered +# critical, nothing was done to mitigate the problem. + +$PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-x86_64.pl:-) + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups"; +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +$code=".text\n"; + +$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! +# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... +$inp="%rdi"; +$out="%rsi"; +$len="%rdx"; +$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! +$ivp="%r8"; # cbc, ctr, ... + +$rnds_="%r10d"; # backup copy for $rounds +$key_="%r11"; # backup copy for $key + +# %xmm register layout +$rndkey0="%xmm0"; $rndkey1="%xmm1"; +$inout0="%xmm2"; $inout1="%xmm3"; +$inout2="%xmm4"; $inout3="%xmm5"; +$inout4="%xmm6"; $inout5="%xmm7"; +$inout6="%xmm8"; $inout7="%xmm9"; + +$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... +$in0="%xmm8"; $iv="%xmm9"; + +# Inline version of internal aesni_[en|de]crypt1. +# +# Why folded loop? Because aes[enc|dec] is slow enough to accommodate +# cycles which take care of loop variables... +{ my $sn; +sub aesni_generate1 { +my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); +++$sn; +$code.=<<___; + $movkey ($key),$rndkey0 + $movkey 16($key),$rndkey1 +___ +$code.=<<___ if (defined($ivec)); + xorps $rndkey0,$ivec + lea 32($key),$key + xorps $ivec,$inout +___ +$code.=<<___ if (!defined($ivec)); + lea 32($key),$key + xorps $rndkey0,$inout +___ +$code.=<<___; +.Loop_${p}1_$sn: + aes${p} $rndkey1,$inout + dec $rounds + $movkey ($key),$rndkey1 + lea 16($key),$key + jnz .Loop_${p}1_$sn # loop body is 16 bytes + aes${p}last $rndkey1,$inout +___ +}} + +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. 
Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal +# utilization, i.e. when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on Atom Silvermont account. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals to corresponding instructions latency. 8x is optimal for +# * Bridge and "super-optimal" for other Intel CPUs... + +sub aesni_generate2 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-1] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt2,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt2: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax + +.L${dir}_loop2: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop2 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + ret +.cfi_endproc +.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 +___ +} +sub aesni_generate3 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-2] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt3,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt3: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax + +.L${dir}_loop3: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + aes${dir} $rndkey0,$inout2 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop3 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + ret +.cfi_endproc +.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 +___ +} +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... +sub aesni_generate4 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-3] is cipher/clear text... 
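+# Like the other _aesni_[en|de]cryptN bodies, the round loop is driven by a
+# negative index into the key schedule: $rounds is scaled to bytes (shl 4)
+# and %rax counts up toward zero, so the same jnz-terminated loop serves
+# both 10- and 14-round keys.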
+$code.=<<___; +.type _aesni_${dir}rypt4,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt4: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 + xorps $rndkey0,$inout3 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + .byte 0x0f,0x1f,0x00 + add \$16,%rax + +.L${dir}_loop4: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop4 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + ret +.cfi_endproc +.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 +___ +} +sub aesni_generate6 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-5] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt6,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt6: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + pxor $rndkey0,$inout1 + pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout0 + lea 32($key,$rounds),$key + neg %rax # $rounds + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout3 + pxor $rndkey0,$inout4 + aes${dir} $rndkey1,$inout2 + pxor $rndkey0,$inout5 + $movkey ($key,%rax),$rndkey0 + add \$16,%rax + jmp .L${dir}_loop6_enter +.align 16 +.L${dir}_loop6: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 +.L${dir}_loop6_enter: + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop6 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + ret +.cfi_endproc +.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 +___ +} +sub aesni_generate8 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-7] is cipher/clear text... 
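+# 8x matches aes[enc|dec] latency on processors that can issue one AES
+# instruction per cycle (see the interleave discussion above) and is the
+# width used by the bulk CTR32 loop below.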
+$code.=<<___; +.type _aesni_${dir}rypt8,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt8: +.cfi_startproc + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + pxor $rndkey0,$inout2 + pxor $rndkey0,$inout3 + pxor $rndkey0,$inout4 + lea 32($key,$rounds),$key + neg %rax # $rounds + aes${dir} $rndkey1,$inout0 + pxor $rndkey0,$inout5 + pxor $rndkey0,$inout6 + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout7 + $movkey ($key,%rax),$rndkey0 + add \$16,%rax + jmp .L${dir}_loop8_inner +.align 16 +.L${dir}_loop8: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 +.L${dir}_loop8_inner: + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 +.L${dir}_loop8_enter: + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + aes${dir} $rndkey0,$inout6 + aes${dir} $rndkey0,$inout7 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop8 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + aes${dir}last $rndkey0,$inout6 + aes${dir}last $rndkey0,$inout7 + ret +.cfi_endproc +.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 +___ +} +&aesni_generate2("enc") if ($PREFIX eq "aes_hw"); +&aesni_generate3("enc") if ($PREFIX eq "aes_hw"); +&aesni_generate4("enc") if ($PREFIX eq "aes_hw"); +&aesni_generate6("enc") if ($PREFIX eq "aes_hw"); +&aesni_generate8("enc") if ($PREFIX eq "aes_hw"); + +if ($PREFIX eq "aes_hw") { +{ +###################################################################### +# void aesni_ctr32_encrypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +# +# Handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see crypto/modes/ctr128.c for details) +# +# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, +# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. +# Keywords are full unroll and modulo-schedule counter calculations +# with zero-round key xor. 
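+# Concretely, block i needs E_key(ivec[0..11] || BE32(ctr+i)).  The eight
+# counter blocks kept on the stack are stored already xor-ed with the
+# 0-round key, and only their last dword is refreshed each iteration with
+# bswap(ctr+i) ^ key[0][12..15], so per-block counter maintenance is just a
+# 32-bit bswap, xor and store.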
+{ +my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); +my ($key0,$ctr)=("%ebp","${ivp}d"); +my $frame_size = 0x80 + ($win64?160:0); + +$code.=<<___; +.globl ${PREFIX}_ctr32_encrypt_blocks +.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 +.align 16 +${PREFIX}_ctr32_encrypt_blocks: +.cfi_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb \$1,BORINGSSL_function_hit(%rip) +#endif + cmp \$1,$len + jne .Lctr32_bulk + + # handle single block without allocating stack frame, + # useful when handling edges + movups ($ivp),$inout0 + movups ($inp),$inout1 + mov 240($key),%edx # key->rounds +___ + &aesni_generate1("enc",$key,"%edx"); +$code.=<<___; + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 + xorps $inout1,$inout0 + pxor $inout1,$inout1 + movups $inout0,($out) + xorps $inout0,$inout0 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: + lea (%rsp),$key_ # use $key_ as frame pointer +.cfi_def_cfa_register $key_ + push %rbp +.cfi_push %rbp + sub \$$frame_size,%rsp + and \$-16,%rsp # Linux kernel stack can be incorrectly seeded +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8($key_) # offload everything + movaps %xmm7,-0x98($key_) + movaps %xmm8,-0x88($key_) + movaps %xmm9,-0x78($key_) + movaps %xmm10,-0x68($key_) + movaps %xmm11,-0x58($key_) + movaps %xmm12,-0x48($key_) + movaps %xmm13,-0x38($key_) + movaps %xmm14,-0x28($key_) + movaps %xmm15,-0x18($key_) +.Lctr32_body: +___ +$code.=<<___; + + # 8 16-byte words on top of stack are counter values + # xor-ed with zero-round key + + movdqu ($ivp),$inout0 + movdqu ($key),$rndkey0 + mov 12($ivp),$ctr # counter LSB + pxor $rndkey0,$inout0 + mov 12($key),$key0 # 0-round key LSB + movdqa $inout0,0x00(%rsp) # populate counter block + bswap $ctr + movdqa $inout0,$inout1 + movdqa $inout0,$inout2 + movdqa $inout0,$inout3 + movdqa $inout0,0x40(%rsp) + movdqa $inout0,0x50(%rsp) + movdqa $inout0,0x60(%rsp) + mov %rdx,%r10 # about to borrow %rdx + movdqa $inout0,0x70(%rsp) + + lea 1($ctr),%rax + lea 2($ctr),%rdx + bswap %eax + bswap %edx + xor $key0,%eax + xor $key0,%edx + pinsrd \$3,%eax,$inout1 + lea 3($ctr),%rax + movdqa $inout1,0x10(%rsp) + pinsrd \$3,%edx,$inout2 + bswap %eax + mov %r10,%rdx # restore %rdx + lea 4($ctr),%r10 + movdqa $inout2,0x20(%rsp) + xor $key0,%eax + bswap %r10d + pinsrd \$3,%eax,$inout3 + xor $key0,%r10d + movdqa $inout3,0x30(%rsp) + lea 5($ctr),%r9 + mov %r10d,0x40+12(%rsp) + bswap %r9d + lea 6($ctr),%r10 + mov 240($key),$rounds # key->rounds + xor $key0,%r9d + bswap %r10d + mov %r9d,0x50+12(%rsp) + xor $key0,%r10d + lea 7($ctr),%r9 + mov %r10d,0x60+12(%rsp) + bswap %r9d + xor $key0,%r9d + mov %r9d,0x70+12(%rsp) + + $movkey 0x10($key),$rndkey1 + + movdqa 0x40(%rsp),$inout4 + movdqa 0x50(%rsp),$inout5 + + cmp \$8,$len # $len is in blocks + jb .Lctr32_tail # short input if ($len<8) + + lea 0x80($key),$key # size optimization + sub \$8,$len # $len is biased by -8 + jmp .Lctr32_loop8 + +.align 32 +.Lctr32_loop8: + add \$8,$ctr # next counter value + movdqa 0x60(%rsp),$inout6 + aesenc $rndkey1,$inout0 + mov $ctr,%r9d + movdqa 0x70(%rsp),$inout7 + aesenc $rndkey1,$inout1 + bswap %r9d + $movkey 0x20-0x80($key),$rndkey0 + aesenc $rndkey1,$inout2 + xor $key0,%r9d + nop + aesenc $rndkey1,$inout3 + mov %r9d,0x00+12(%rsp) # store next counter value + lea 1($ctr),%r9 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0x30-0x80($key),$rndkey1 +___ +for($i=2;$i<8;$i++) { +my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; +$code.=<<___; + bswap %r9d + 
aesenc $rndkeyx,$inout0 + aesenc $rndkeyx,$inout1 + xor $key0,%r9d + .byte 0x66,0x90 + aesenc $rndkeyx,$inout2 + aesenc $rndkeyx,$inout3 + mov %r9d,`0x10*($i-1)`+12(%rsp) + lea $i($ctr),%r9 + aesenc $rndkeyx,$inout4 + aesenc $rndkeyx,$inout5 + aesenc $rndkeyx,$inout6 + aesenc $rndkeyx,$inout7 + $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx +___ +} +$code.=<<___; + bswap %r9d + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + xor $key0,%r9d + movdqu 0x00($inp),$in0 # start loading input + aesenc $rndkey0,$inout3 + mov %r9d,0x70+12(%rsp) + cmp \$11,$rounds + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xa0-0x80($key),$rndkey0 + + jb .Lctr32_enc_done + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0xb0-0x80($key),$rndkey1 + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xc0-0x80($key),$rndkey0 + # 192-bit key support was removed. + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + $movkey 0xd0-0x80($key),$rndkey1 + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + aesenc $rndkey0,$inout6 + aesenc $rndkey0,$inout7 + $movkey 0xe0-0x80($key),$rndkey0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 0x10($inp),$in1 + pxor $rndkey0,$in0 # input^=round[last] + movdqu 0x20($inp),$in2 + pxor $rndkey0,$in1 + movdqu 0x30($inp),$in3 + pxor $rndkey0,$in2 + movdqu 0x40($inp),$in4 + pxor $rndkey0,$in3 + movdqu 0x50($inp),$in5 + pxor $rndkey0,$in4 + prefetcht0 0x1c0($inp) # We process 128 bytes (8*16), so to prefetch 1 iteration + prefetcht0 0x200($inp) # We need to prefetch 2 64 byte lines + pxor $rndkey0,$in5 + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + aesenc $rndkey1,$inout7 + movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] + lea 0x80($inp),$inp # $inp+=8*16 + + aesenclast $in0,$inout0 # $inN is inp[N]^round[last] + pxor $rndkey0,$rndkey1 # borrowed $rndkey + movdqu 0x70-0x80($inp),$in0 + aesenclast $in1,$inout1 + pxor $rndkey0,$in0 + movdqa 0x00(%rsp),$in1 # load next counter block + aesenclast $in2,$inout2 + aesenclast $in3,$inout3 + movdqa 0x10(%rsp),$in2 + movdqa 0x20(%rsp),$in3 + aesenclast $in4,$inout4 + aesenclast $in5,$inout5 + movdqa 0x30(%rsp),$in4 + movdqa 0x40(%rsp),$in5 + aesenclast $rndkey1,$inout6 + movdqa 0x50(%rsp),$rndkey0 + $movkey 0x10-0x80($key),$rndkey1#real 1st-round key + aesenclast $in0,$inout7 + + movups $inout0,($out) # store 8 output blocks + movdqa $in1,$inout0 + movups $inout1,0x10($out) + movdqa $in2,$inout1 + movups $inout2,0x20($out) + movdqa $in3,$inout2 + movups $inout3,0x30($out) + movdqa $in4,$inout3 + movups $inout4,0x40($out) + movdqa $in5,$inout4 + movups $inout5,0x50($out) + movdqa $rndkey0,$inout5 + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out # $out+=8*16 + + sub \$8,$len + jnc 
.Lctr32_loop8 # loop if $len-=8 didn't borrow + + add \$8,$len # restore real remaining $len + jz .Lctr32_done # done if ($len==0) + lea -0x80($key),$key + +.Lctr32_tail: + # note that at this point $inout0..5 are populated with + # counter values xor-ed with 0-round key + lea 16($key),$key + cmp \$4,$len + jb .Lctr32_loop3 + je .Lctr32_loop4 + + # if ($len>4) compute 7 E(counter) + shl \$4,$rounds + movdqa 0x60(%rsp),$inout6 + pxor $inout7,$inout7 + + $movkey 16($key),$rndkey0 + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter + neg %rax + aesenc $rndkey1,$inout2 + add \$16,%rax # prepare for .Lenc_loop8_enter + movups ($inp),$in0 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + movups 0x10($inp),$in1 # pre-load input + movups 0x20($inp),$in2 + aesenc $rndkey1,$inout5 + aesenc $rndkey1,$inout6 + + call .Lenc_loop8_enter + + movdqu 0x30($inp),$in3 + pxor $in0,$inout0 + movdqu 0x40($inp),$in0 + pxor $in1,$inout1 + movdqu $inout0,($out) # store output + pxor $in2,$inout2 + movdqu $inout1,0x10($out) + pxor $in3,$inout3 + movdqu $inout2,0x20($out) + pxor $in0,$inout4 + movdqu $inout3,0x30($out) + movdqu $inout4,0x40($out) + cmp \$6,$len + jb .Lctr32_done # $len was 5, stop store + + movups 0x50($inp),$in1 + xorps $in1,$inout5 + movups $inout5,0x50($out) + je .Lctr32_done # $len was 6, stop store + + movups 0x60($inp),$in2 + xorps $in2,$inout6 + movups $inout6,0x60($out) + jmp .Lctr32_done # $len was 7, stop store + +.align 32 +.Lctr32_loop4: + aesenc $rndkey1,$inout0 + lea 16($key),$key + dec $rounds + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey ($key),$rndkey1 + jnz .Lctr32_loop4 + aesenclast $rndkey1,$inout0 + aesenclast $rndkey1,$inout1 + movups ($inp),$in0 # load input + movups 0x10($inp),$in1 + aesenclast $rndkey1,$inout2 + aesenclast $rndkey1,$inout3 + movups 0x20($inp),$in2 + movups 0x30($inp),$in3 + + xorps $in0,$inout0 + movups $inout0,($out) # store output + xorps $in1,$inout1 + movups $inout1,0x10($out) + pxor $in2,$inout2 + movdqu $inout2,0x20($out) + pxor $in3,$inout3 + movdqu $inout3,0x30($out) + jmp .Lctr32_done # $len was 4, stop store + +.align 32 +.Lctr32_loop3: + aesenc $rndkey1,$inout0 + lea 16($key),$key + dec $rounds + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + $movkey ($key),$rndkey1 + jnz .Lctr32_loop3 + aesenclast $rndkey1,$inout0 + aesenclast $rndkey1,$inout1 + aesenclast $rndkey1,$inout2 + + movups ($inp),$in0 # load input + xorps $in0,$inout0 + movups $inout0,($out) # store output + cmp \$2,$len + jb .Lctr32_done # $len was 1, stop store + + movups 0x10($inp),$in1 + xorps $in1,$inout1 + movups $inout1,0x10($out) + je .Lctr32_done # $len was 2, stop store + + movups 0x20($inp),$in2 + xorps $in2,$inout2 + movups $inout2,0x20($out) # $len was 3, stop store + +.Lctr32_done: + xorps %xmm0,%xmm0 # clear register bank + xor $key0,$key0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0x00(%rsp) # clear stack + pxor %xmm8,%xmm8 + movaps %xmm0,0x10(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,0x20(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,0x30(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,0x40(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,0x50(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,0x60(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,0x70(%rsp) + pxor %xmm15,%xmm15 +___ +$code.=<<___ if ($win64); + movaps -0xa8($key_),%xmm6 + movaps 
%xmm0,-0xa8($key_) # clear stack + movaps -0x98($key_),%xmm7 + movaps %xmm0,-0x98($key_) + movaps -0x88($key_),%xmm8 + movaps %xmm0,-0x88($key_) + movaps -0x78($key_),%xmm9 + movaps %xmm0,-0x78($key_) + movaps -0x68($key_),%xmm10 + movaps %xmm0,-0x68($key_) + movaps -0x58($key_),%xmm11 + movaps %xmm0,-0x58($key_) + movaps -0x48($key_),%xmm12 + movaps %xmm0,-0x48($key_) + movaps -0x38($key_),%xmm13 + movaps %xmm0,-0x38($key_) + movaps -0x28($key_),%xmm14 + movaps %xmm0,-0x28($key_) + movaps -0x18($key_),%xmm15 + movaps %xmm0,-0x18($key_) + movaps %xmm0,0x00(%rsp) + movaps %xmm0,0x10(%rsp) + movaps %xmm0,0x20(%rsp) + movaps %xmm0,0x30(%rsp) + movaps %xmm0,0x40(%rsp) + movaps %xmm0,0x50(%rsp) + movaps %xmm0,0x60(%rsp) + movaps %xmm0,0x70(%rsp) +___ +$code.=<<___; + mov -8($key_),%rbp +.cfi_restore %rbp + lea ($key_),%rsp +.cfi_def_cfa_register %rsp +.Lctr32_epilogue: + ret +.cfi_endproc +.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks +___ +} }} + +{ my ($inp,$bits,$key) = @_4args; + $bits =~ s/%r/%e/; +# This is based on submission from Intel by +# Huang Ying +# Vinodh Gopal +# Kahraman Akdemir +# +# Aggressively optimized in respect to aeskeygenassist's critical path +# and is contained in %xmm0-5 to meet Win64 ABI requirement. +# +# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, +# int bits, AES_KEY * const key); +# +# input: $inp user-supplied key +# $bits $inp length in bits +# $key pointer to key schedule +# output: %eax 0 denoting success, -1 or -2 - failure (see C) +# $bits rounds-1 (used in aesni_set_decrypt_key) +# *$key key schedule +# $key pointer to key schedule (used in +# aesni_set_decrypt_key) +# +# Subroutine is frame-less, which means that only volatile registers +# are used. Note that it's declared "abi-omnipotent", which means that +# amount of volatile registers is smaller on Windows. +# +# There are two variants of this function, one which uses aeskeygenassist +# ("base") and one which uses aesenclast + pshufb ("alt"). See aes/internal.h +# for details. +$code.=<<___; +.globl ${PREFIX}_set_encrypt_key_base +.type ${PREFIX}_set_encrypt_key_base,\@abi-omnipotent +.align 16 +${PREFIX}_set_encrypt_key_base: +.cfi_startproc +.seh_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb \$1,BORINGSSL_function_hit+3(%rip) +#endif + sub \$8,%rsp +.cfi_adjust_cfa_offset 8 +.seh_stackalloc 8 +.seh_endprologue + movups ($inp),%xmm0 # pull first 128 bits of *userKey + xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 + lea 16($key),%rax # %rax is used as modifiable copy of $key + cmp \$256,$bits + je .L14rounds + # 192-bit key support was removed. 
+ + cmp \$128,$bits + jne .Lbad_keybits + +.L10rounds: + mov \$9,$bits # 10 rounds for 128-bit key + + $movkey %xmm0,($key) # round 0 + aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 + call .Lkey_expansion_128_cold + aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 + call .Lkey_expansion_128 + aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 + call .Lkey_expansion_128 + aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 + call .Lkey_expansion_128 + aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 + call .Lkey_expansion_128 + aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 + call .Lkey_expansion_128 + aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 + call .Lkey_expansion_128 + aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 + call .Lkey_expansion_128 + aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 + call .Lkey_expansion_128 + aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 + call .Lkey_expansion_128 + $movkey %xmm0,(%rax) + mov $bits,80(%rax) # 240(%rdx) + xor %eax,%eax + jmp .Lenc_key_ret + + # 192-bit key support was removed. + +.align 16 +.L14rounds: + movups 16($inp),%xmm2 # remaining half of *userKey + mov \$13,$bits # 14 rounds for 256 + lea 16(%rax),%rax + + $movkey %xmm0,($key) # round 0 + $movkey %xmm2,16($key) # round 1 + aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 + call .Lkey_expansion_256a_cold + aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 + call .Lkey_expansion_256b + aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 + call .Lkey_expansion_256a + aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 + call .Lkey_expansion_256b + aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 + call .Lkey_expansion_256a + aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 + call .Lkey_expansion_256b + aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 + call .Lkey_expansion_256a + aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 + call .Lkey_expansion_256b + aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 + call .Lkey_expansion_256a + aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 + call .Lkey_expansion_256b + aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 + call .Lkey_expansion_256a + aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 + call .Lkey_expansion_256b + aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 + call .Lkey_expansion_256a + $movkey %xmm0,(%rax) + mov $bits,16(%rax) # 240(%rdx) + xor %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + mov \$-2,%rax +.Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + add \$8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc +.seh_endproc + +.align 16 +.Lkey_expansion_128: +.cfi_startproc + $movkey %xmm0,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps \$0b00010000,%xmm0,%xmm4 + xorps %xmm4, %xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + xorps %xmm4, %xmm0 + shufps \$0b11111111,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256a: +.cfi_startproc + $movkey %xmm2,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_256a_cold: + shufps \$0b00010000,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps \$0b11111111,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256b: +.cfi_startproc + $movkey %xmm0,(%rax) + lea 16(%rax),%rax + + shufps \$0b00010000,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps \$0b10001100,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps \$0b10101010,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm2 + ret +.cfi_endproc +.size ${PREFIX}_set_encrypt_key_base,.-${PREFIX}_set_encrypt_key_base + +.globl 
${PREFIX}_set_encrypt_key_alt +.type ${PREFIX}_set_encrypt_key_alt,\@abi-omnipotent +.align 16 +${PREFIX}_set_encrypt_key_alt: +.cfi_startproc +.seh_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb \$1,BORINGSSL_function_hit+3(%rip) +#endif + sub \$8,%rsp +.cfi_adjust_cfa_offset 8 +.seh_stackalloc 8 +.seh_endprologue + movups ($inp),%xmm0 # pull first 128 bits of *userKey + xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 + lea 16($key),%rax # %rax is used as modifiable copy of $key + cmp \$256,$bits + je .L14rounds_alt + # 192-bit key support was removed. + cmp \$128,$bits + jne .Lbad_keybits_alt + + mov \$9,$bits # 10 rounds for 128-bit key + movdqa .Lkey_rotate(%rip),%xmm5 + mov \$8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,($key) + jmp .Loop_key128 + +.align 16 +.Loop_key128: + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld \$1,%xmm4 + lea 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + dec %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld \$1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + mov $bits,96(%rax) # 240($key) + xor %eax,%eax + jmp .Lenc_key_ret_alt + + # 192-bit key support was removed. 
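+	# In the 256-bit loop below, the round keys derived from the second
+	# key half use pshufd with immediate 0xff followed by aesenclast
+	# against an all-zero round key, i.e. SubWord() without the
+	# rotate/rcon step, as the AES-256 schedule requires for those words.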
+ +.align 16 +.L14rounds_alt: + movups 16($inp),%xmm2 # remaining half of *userKey + mov \$13,$bits # 14 rounds for 256 + lea 16(%rax),%rax + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + mov \$7,%r10d + movdqu %xmm0,0($key) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16($key) + jmp .Loop_key256 + +.align 16 +.Loop_key256: + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + + movdqa %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm3,%xmm0 + pslld \$1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + dec %r10d + jz .Ldone_key256 + + pshufd \$0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 + aesenclast %xmm3,%xmm2 + + movdqa %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + lea 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + mov $bits,16(%rax) # 240($key) + xor %eax,%eax + jmp .Lenc_key_ret_alt + +.align 16 +.Lbad_keybits_alt: + mov \$-2,%rax +.Lenc_key_ret_alt: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + add \$8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc +.seh_endproc +.size ${PREFIX}_set_encrypt_key_alt,.-${PREFIX}_set_encrypt_key_alt +___ +} + +$code.=<<___; +.section .rodata +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lincrement32: + .long 6,6,6,0 +.Lincrement64: + .long 1,0,0,0 +.Lincrement1: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: + .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: + .long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: + .long 1,1,1,1 +.Lkey_rcon1b: + .long 0x1b,0x1b,0x1b,0x1b + +.asciz "AES for Intel AES-NI, CRYPTOGAMS by " +.align 64 +.text +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +___ +$code.=<<___ if ($PREFIX eq "aes_hw"); +.type ctr_xts_se_handler,\@abi-omnipotent +.align 16 +ctr_xts_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue lable + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 208($context),%rax # pull context->R11 + + lea -0xa8(%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + + mov -8(%rax),%rbp # restore saved %rbp + mov %rbp,160($context) # restore context->Rbp + + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + 
mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size ctr_xts_se_handler,.-ctr_xts_se_handler + +.section .pdata +.align 4 +___ +$code.=<<___ if ($PREFIX eq "aes_hw"); + .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_info_ctr32 +___ +$code.=<<___; +.section .xdata +.align 8 +___ +$code.=<<___ if ($PREFIX eq "aes_hw"); +.LSEH_info_ctr32: + .byte 9,0,0,0 + .rva ctr_xts_se_handler + .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] +___ +} + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex|0x40 if($rex); +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + rex(\@opcode,$4,$3); + push @opcode,0x0f,0x3a,0xdf; + push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M + my $c=$2; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesimc" => 0xdb, + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + my $off = $2; + push @opcode,0x44 if ($3>=8); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M + push @opcode,($off=~/^0/?oct($off):$off)&0xff; + return ".byte\t".join(',',@opcode); + } + return $line; +} + +sub movbe { + ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; +#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact +$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-armx.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-armx.pl new file mode 100644 index 0000000000..5dfb4b1d8d --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-armx.pl @@ -0,0 +1,587 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# This module implements support for ARMv8 AES instructions. The +# module is endian-agnostic in sense that it supports both big- and +# little-endian cases. As does it support both 32- and 64-bit modes +# of operation. Latter is achieved by limiting amount of utilized +# registers to 16, which implies additional NEON load and integer +# instructions. This has no effect on mighty Apple A7, where results +# are literally equal to the theoretical estimates based on AES +# instruction latencies and issue rates. On Cortex-A53, an in-order +# execution core, this costs up to 10-15%, which is partially +# compensated by implementing dedicated code path for 128-bit +# CBC encrypt case. On Cortex-A57 parallelizable mode performance +# seems to be limited by sheer amount of NEON instructions... +# +# Performance in cycles per byte processed with 128-bit key: +# +# CBC enc CBC dec CTR +# Apple A7 2.39 1.20 1.20 +# Cortex-A53 1.32 1.29 1.46 +# Cortex-A57(*) 1.95 0.85 0.93 +# Denver 1.96 0.86 0.80 +# Mongoose 1.33 1.20 1.20 +# +# (*) original 3.64/1.34/1.32 results were for r0p0 revision +# and are still same even for updated module; + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$prefix="aes_hw"; + +$code=<<___; +#if __ARM_MAX_ARCH__>=7 +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour !~ /64/); +.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) +.fpu neon +.code 32 +#undef __thumb2__ +___ + +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, +# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to +# maintain both 32- and 64-bit codes within single module and +# transliterate common code to either flavour with regex vodoo. +# +{{{ +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= + $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); + + +# On AArch64, put the data .rodata and use adrp + add for compatibility with +# execute-only memory. On AArch32, put it in .text and use adr. +$code.= ".section .rodata\n" if ($flavour =~ /64/); +$code.=<<___; +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: +.Lenc_key: +___ +$code.=<<___ if ($flavour =~ /64/); + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___; + mov $ptr,#-2 + cmp $bits,#128 + b.lt .Lenc_key_abort + cmp $bits,#256 + b.gt .Lenc_key_abort + tst $bits,#0x3f + b.ne .Lenc_key_abort + +___ +$code.=<<___ if ($flavour =~ /64/); + adrp $ptr,:pg_hi21:.Lrcon + add $ptr,$ptr,:lo12:.Lrcon +___ +$code.=<<___ if ($flavour !~ /64/); + adr $ptr,.Lrcon +___ +$code.=<<___; + cmp $bits,#192 + + veor $zero,$zero,$zero + vld1.8 {$in0},[$inp],#16 + mov $bits,#8 // reuse $bits + vld1.32 {$rcon,$mask},[$ptr],#32 + + b.lt .Loop128 + // 192-bit key support was removed. + b .L256 + +.align 4 +.Loop128: + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + b.ne .Loop128 + + vld1.32 {$rcon},[$ptr] + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + veor $in0,$in0,$key + vst1.32 {$in0},[$out] + add $out,$out,#0x50 + + mov $rounds,#10 + b .Ldone + +// 192-bit key support was removed. + +.align 4 +.L256: + vld1.8 {$in1},[$inp] + mov $bits,#7 + mov $rounds,#14 + vst1.32 {$in0},[$out],#16 + +.Loop256: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + vst1.32 {$in0},[$out],#16 + b.eq .Ldone + + vdup.32 $key,${in0}[3] // just splat + vext.8 $tmp,$zero,$in1,#12 + aese $key,$zero + + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + + veor $in1,$in1,$key + b .Loop256 + +.Ldone: + str $rounds,[$out] + mov $ptr,#0 + +.Lenc_key_abort: + mov x0,$ptr // return value + `"ldr x29,[sp],#16" if ($flavour =~ /64/)` + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); +my ($rounds,$cnt,$key_)=("w5","w6","x7"); +my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); +my $step="x12"; # aliases with $tctr2 + +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); +my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); + +my ($dat,$tmp)=($dat0,$tmp0); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($flavour =~ /64/); + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r10,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg +___ +$code.=<<___; + ldr $rounds,[$key,#240] + + ldr $ctr, [$ivp, #12] + vld1.32 {$dat0},[$ivp] + + vld1.32 {q8-q9},[$key] // load key schedule... + sub $rounds,$rounds,#4 + mov $step,#16 + cmp $len,#2 + add $key_,$key,x5,lsl#4 // pointer to last 5 round keys + sub $rounds,$rounds,#2 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + add $key_,$key,#32 + mov $cnt,$rounds + cclr $step,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov.32 lines + // could write to $dat1 and $dat2 directly, but that trips this bugs. + // We write to $ivec and copy to the final register as a workaround. + // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __ARMEB__ + rev $ctr, $ctr +#endif + add $tctr1, $ctr, #1 + vorr $ivec,$dat0,$dat0 + rev $tctr1, $tctr1 + vmov.32 ${ivec}[3],$tctr1 + add $ctr, $ctr, #2 + vorr $dat1,$ivec,$ivec + b.ls .Lctr32_tail + rev $tctr2, $ctr + vmov.32 ${ivec}[3],$tctr2 + sub $len,$len,#3 // bias + vorr $dat2,$ivec,$ivec + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese $dat0,q8 + aesmc $dat0,$dat0 + aese $dat1,q8 + aesmc $dat1,$dat1 + aese $dat2,q8 + aesmc $dat2,$dat2 + vld1.32 {q8},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aesmc $dat0,$dat0 + aese $dat1,q9 + aesmc $dat1,$dat1 + aese $dat2,q9 + aesmc $dat2,$dat2 + vld1.32 {q9},[$key_],#16 + b.gt .Loop3x_ctr32 + + aese $dat0,q8 + aesmc $tmp0,$dat0 + aese $dat1,q8 + aesmc $tmp1,$dat1 + vld1.8 {$in0},[$inp],#16 + add $tctr0,$ctr,#1 + aese $dat2,q8 + aesmc $dat2,$dat2 + vld1.8 {$in1},[$inp],#16 + rev $tctr0,$tctr0 + aese $tmp0,q9 + aesmc $tmp0,$tmp0 + aese $tmp1,q9 + aesmc $tmp1,$tmp1 + vld1.8 {$in2},[$inp],#16 + mov $key_,$key + aese $dat2,q9 + aesmc $tmp2,$dat2 + aese $tmp0,q12 + aesmc $tmp0,$tmp0 + aese $tmp1,q12 + aesmc $tmp1,$tmp1 + veor $in0,$in0,$rndlast + add $tctr1,$ctr,#2 + aese $tmp2,q12 + aesmc $tmp2,$tmp2 + veor $in1,$in1,$rndlast + add $ctr,$ctr,#3 + aese $tmp0,q13 + aesmc $tmp0,$tmp0 + aese $tmp1,q13 + aesmc $tmp1,$tmp1 + // Note the logic to update $dat0, $dat1, and $dat1 is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. 
+ veor $in2,$in2,$rndlast + vmov.32 ${ivec}[3], $tctr0 + aese $tmp2,q13 + aesmc $tmp2,$tmp2 + vorr $dat0,$ivec,$ivec + rev $tctr1,$tctr1 + aese $tmp0,q14 + aesmc $tmp0,$tmp0 + vmov.32 ${ivec}[3], $tctr1 + rev $tctr2,$ctr + aese $tmp1,q14 + aesmc $tmp1,$tmp1 + vorr $dat1,$ivec,$ivec + vmov.32 ${ivec}[3], $tctr2 + aese $tmp2,q14 + aesmc $tmp2,$tmp2 + vorr $dat2,$ivec,$ivec + subs $len,$len,#3 + aese $tmp0,q15 + aese $tmp1,q15 + aese $tmp2,q15 + + veor $in0,$in0,$tmp0 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + vst1.8 {$in0},[$out],#16 + veor $in1,$in1,$tmp1 + mov $cnt,$rounds + vst1.8 {$in1},[$out],#16 + veor $in2,$in2,$tmp2 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vst1.8 {$in2},[$out],#16 + b.hs .Loop3x_ctr32 + + adds $len,$len,#3 + b.eq .Lctr32_done + cmp $len,#1 + mov $step,#16 + cclr $step,eq + +.Lctr32_tail: + aese $dat0,q8 + aesmc $dat0,$dat0 + aese $dat1,q8 + aesmc $dat1,$dat1 + vld1.32 {q8},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aesmc $dat0,$dat0 + aese $dat1,q9 + aesmc $dat1,$dat1 + vld1.32 {q9},[$key_],#16 + b.gt .Lctr32_tail + + aese $dat0,q8 + aesmc $dat0,$dat0 + aese $dat1,q8 + aesmc $dat1,$dat1 + aese $dat0,q9 + aesmc $dat0,$dat0 + aese $dat1,q9 + aesmc $dat1,$dat1 + vld1.8 {$in0},[$inp],$step + aese $dat0,q12 + aesmc $dat0,$dat0 + aese $dat1,q12 + aesmc $dat1,$dat1 + vld1.8 {$in1},[$inp] + aese $dat0,q13 + aesmc $dat0,$dat0 + aese $dat1,q13 + aesmc $dat1,$dat1 + veor $in0,$in0,$rndlast + aese $dat0,q14 + aesmc $dat0,$dat0 + aese $dat1,q14 + aesmc $dat1,$dat1 + veor $in1,$in1,$rndlast + aese $dat0,q15 + aese $dat1,q15 + + cmp $len,#1 + veor $in0,$in0,$dat0 + veor $in1,$in1,$dat1 + vst1.8 {$in0},[$out],#16 + b.eq .Lctr32_done + vst1.8 {$in1},[$out] + +.Lctr32_done: +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r10,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +$code.=<<___; +#endif +___ +######################################## +if ($flavour =~ /64/) { ######## 64-bit code + my %opcode = ( + "aesd" => 0x4e285800, "aese" => 0x4e284800, + "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5), + $mnemonic,$arg; + }; + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vext\.8/ext/o or + s/vrev32\.8/rev32/o or + s/vtst\.8/cmtst/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + # fix up remaining legacy suffixes + s/\.[ui]?8//o; + m/\],#8/o and s/\.16b/\.8b/go; + s/\.[ui]?32//o and s/\.16b/\.4s/go; + s/\.[ui]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + # Switch preprocessor checks to aarch64 versions. 
+ s/__ARME([BL])__/__AARCH64E$1__/go; + + print $_,"\n"; + } +} else { ######## 32-bit code + my %opcode = ( + "aesd" => 0xf3b00340, "aese" => 0xf3b00300, + "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<1) |(($2&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + }; + + sub unvtbl { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && + sprintf "vtbl.8 d%d,{q%d},d%d\n\t". + "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; + } + + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + + sub unvmov32 { + my $arg=shift; + + $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && + sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; + } + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remaining new-style suffixes + s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or + s/\],#[0-9]+/]!/o; + + s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vtbl\.8\s+(.*)/unvtbl($1)/geo or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/vmov\.32\s+(.*)/unvmov32($1)/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)mov\./$1mov/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl new file mode 100644 index 0000000000..4f862a9ba5 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/aesv8-gcm-armv8.pl @@ -0,0 +1,1541 @@ +#! /usr/bin/env perl + +# Copyright (c) 2022, ARM Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#======================================================================== +# Written by Fangming Fang for the OpenSSL project, +# derived from https://github.com/ARM-software/AArch64cryptolib, original +# author Samuel Lee . 
+#======================================================================== +# +# Approach - assume we don't want to reload constants, so reserve ~half of +# vector register file for constants +# +# main loop to act on 4 16B blocks per iteration, and then do modulo of the +# accumulated intermediate hashes from the 4 blocks +# +# ____________________________________________________ +# | | +# | PRE | +# |____________________________________________________| +# | | | | +# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 | +# |________________|________________|__________________| +# | | | | +# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 | +# |________________|________________|__________________| +# | | | | +# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 | +# |________________|________________|__________________| +# | | | | +# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 | +# |________________|____(mostly)____|__________________| +# | | +# | MODULO | +# |____________________________________________________| +# +# PRE: Ensure previous generated intermediate hash is aligned and merged with +# result for GHASH 4k+0 +# +# EXT low_acc, low_acc, low_acc, #8 +# EOR res_curr (4k+0), res_curr (4k+0), low_acc +# +# CTR block: Increment and byte reverse counter in scalar registers and transfer +# to SIMD registers +# +# REV ctr32, rev_ctr32 +# ORR ctr64, constctr96_top32, ctr32, LSL #32 +# // Keeping this in scalar registers to free up space in SIMD RF +# INS ctr_next.d[0], constctr96_bottom64 +# INS ctr_next.d[1], ctr64X +# ADD rev_ctr32, #1 +# +# AES block: +# +# Do AES encryption/decryption on CTR block X and EOR it with input block X. +# Take 256 bytes key below for example. Doing small trick here of loading input +# in scalar registers, EORing with last key and then transferring Given we are +# very constrained in our ASIMD registers this is quite important +# +# Encrypt: +# LDR input_low, [ input_ptr ], #8 +# LDR input_high, [ input_ptr ], #8 +# EOR input_low, k14_low +# EOR input_high, k14_high +# INS res_curr.d[0], input_low +# INS res_curr.d[1], input_high +# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k13 +# EOR res_curr, res_curr, ctr_curr +# ST1 { res_curr.16b }, [ output_ptr ], #16 +# +# Decrypt: +# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k13 +# LDR res_curr, [ input_ptr ], #16 +# EOR res_curr, res_curr, ctr_curr 
+# MOV output_low, res_curr.d[0] +# MOV output_high, res_curr.d[1] +# EOR output_low, k14_low +# EOR output_high, k14_high +# STP output_low, output_high, [ output_ptr ], #16 +# +# GHASH block X: +# Do 128b karatsuba polynomial multiplication on block. We only have +# 64b->128b polynomial multipliers, naively that means we need to do 4 64b +# multiplies to generate a 128b. +# +# multiplication: +# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ +# (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 +# +# The idea behind Karatsuba multiplication is that we can do just 3 64b +# multiplies: +# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ +# (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ +# Pmull(Al,Bl))<<64 +# +# There is some complication here because the bit order of GHASH's PMULL is +# reversed compared to elsewhere, so we are multiplying with "twisted" +# powers of H +# +# Note: We can PMULL directly into the acc_x in first GHASH of the loop +# +# Note: For scheduling big cores we want to split the processing to happen over +# two loop iterations - otherwise the critical path latency dominates the +# performance. +# +# This has a knock on effect on register pressure, so we have to be a bit +# more clever with our temporary registers than indicated here +# +# REV64 res_curr, res_curr +# INS t_m.d[0], res_curr.d[1] +# EOR t_m.8B, t_m.8B, res_curr.8B +# PMULL2 t_h, res_curr, HX +# PMULL t_l, res_curr, HX +# PMULL t_m, t_m, HX_k +# EOR acc_h, acc_h, t_h +# EOR acc_l, acc_l, t_l +# EOR acc_m, acc_m, t_m +# +# MODULO: take the partial accumulators (~representing sum of 256b +# multiplication results), from GHASH and do modulo reduction on them +# There is some complication here because the bit order of GHASH's +# PMULL is reversed compared to elsewhere, so we are doing modulo with +# a reversed constant +# +# EOR acc_m, acc_m, acc_h +# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing +# PMULL t_mod, acc_h, mod_constant +# EXT acc_h, acc_h, acc_h, #8 +# EOR acc_m, acc_m, acc_h +# EOR acc_m, acc_m, t_mod +# PMULL acc_h, acc_m, mod_constant +# EXT acc_m, acc_m, acc_m, #8 +# EOR acc_l, acc_l, acc_h +# EOR acc_l, acc_l, acc_m +# +# This code was then modified to merge the AES-128-GCM, AES-192-GCM, and +# AES-256-GCM implementations into a single function to reduce size. We move the +# last two round keys into consistent registers across all sizes, as they're +# treated special. Then, after rounds 0 through 8, we added some branches to +# conditionally run rounds 9-10 (AES-192 + AES-256) and 11-12 (AES-256), before +# merging back into code which finishes up the last two rounds. +# +# There is a mostly decision to be made around how much parallel work goes +# before or after the conditional part. We attempted to preserve the original +# scheduling where possible, but it's possible other schedulings are more +# optimal with the current ordering. 
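+#
+# As a concrete sanity check of the Karatsuba identity above, here is a small
+# standalone Perl sketch (illustrative only, not part of the generated
+# assembly; the operand values and variable names are arbitrary). It models
+# PMULL as a carry-less multiply over GF(2)[x] on Math::BigInt values and
+# verifies that the 3-multiply Karatsuba form gives the same full-width
+# product of two 128-bit operands as the 4-multiply schoolbook form. It is
+# kept commented out so it has no effect on this script; copy it into its own
+# file to run it.
+#
+#   #!/usr/bin/env perl
+#   use strict;
+#   use warnings;
+#   use Math::BigInt;
+#
+#   # PMULL model: carry-less multiply of two 64-bit values,
+#   # yielding an up-to-127-bit product.
+#   sub clmul {
+#       my ($a, $b) = @_;
+#       my $acc = Math::BigInt->new(0);
+#       for my $i (0 .. 63) {
+#           $acc->bxor($b->copy->blsft($i)) if $a->copy->brsft($i)->band(1)->is_one();
+#       }
+#       return $acc;
+#   }
+#
+#   # Arbitrary 128-bit operands A = Ah||Al and B = Bh||Bl, split into 64-bit halves.
+#   my ($ah, $al) = map { Math::BigInt->from_hex($_) } qw(0x0123456789abcdef 0x0fedcba987654321);
+#   my ($bh, $bl) = map { Math::BigInt->from_hex($_) } qw(0xdeadbeefcafef00d 0x0badc0dedeadc0de);
+#
+#   # Schoolbook form: four 64x64 carry-less multiplies.
+#   my $school = clmul($ah, $bh)->blsft(128)
+#       ->bxor(clmul($al, $bl))
+#       ->bxor(clmul($ah, $bl)->bxor(clmul($al, $bh))->blsft(64));
+#
+#   # Karatsuba form: three multiplies, as used per GHASH block above.
+#   my $hi  = clmul($ah, $bh);
+#   my $lo  = clmul($al, $bl);
+#   my $mid = clmul($ah->copy->bxor($al), $bh->copy->bxor($bl));
+#   $mid->bxor($hi)->bxor($lo);                      # now equals Ah*Bl ^ Al*Bh
+#   my $karatsuba = $hi->copy->blsft(128)->bxor($lo)->bxor($mid->blsft(64));
+#
+#   print $school->bcmp($karatsuba) == 0 ? "identity holds\n" : "mismatch\n";
+#
+# In the kernels below the same trick appears as the PMULL/PMULL2 high and low
+# multiplies plus one PMULL on the XORed halves (the "mid" terms), followed by
+# the MODULO reduction described above.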
+ +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$code=<<___; +#if __ARM_MAX_ARCH__ >= 8 + +.arch armv8-a+crypto +.text +___ + +$input_ptr="x0"; #argument block +$bit_length="x1"; +$output_ptr="x2"; +$current_tag="x3"; +$Htable="x6"; +$counter="x16"; +$cc="x8"; + +{ +my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); +my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); +my ($output_l0,$output_h0)=map("x$_",(6..7)); + +# rkN_l and rkN_h store the final round key, which is handled slightly +# differently because it is EORed through general-purpose registers. +my $ctr32w="w9"; +my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rkN_l,$rkN_h,$len)=map("x$_",(9..15)); +my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); + +my $rounds="x17"; +my $roundsw="w17"; + +my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); +my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); +my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); +my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); + +my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); +my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); +my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); + +my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); +my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); +my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); + +my $t0="v8"; +my $t0d="d8"; +my $t1="v4"; +my $t1d="d4"; +my $t2="v8"; +my $t2d="d8"; +my $t3="v4"; +my $t3d="d4"; +my $t4="v4"; +my $t4d="d4"; +my $t5="v5"; +my $t5d="d5"; +my $t6="v8"; +my $t6d="d8"; +my $t7="v5"; +my $t7d="d5"; +my $t8="v6"; +my $t8d="d6"; +my $t9="v4"; +my $t9d="d4"; + +my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); +my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); +my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); + +my $mod_constantd="d8"; +my $mod_constant="v8"; +my $mod_t="v7"; + +# rkNm1 stores the second-to-last round key, which is handled slightly +# differently because it uses plain AESE instead of an AESE + AESMC macro-op. +my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rkNm1)=map("v$_.16b",(18..31)); +my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rkNm1q)=map("q$_",(18..31)); +my $rk2q1="v20.1q"; +my $rk3q1="v21.1q"; +my $rk4v="v22"; +my $rk4d="d22"; + +################################################################################ +# size_t aes_gcm_enc_kernel(const uint8_t *in, +# size_t len_bits, +# uint8_t *out, +# u64 *Xi, +# uint8_t ivec[16], +# const void *key, +# const void *Htable); +# +$code.=<<___; +.global aes_gcm_enc_kernel +.type aes_gcm_enc_kernel,%function +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov $counter, x4 + mov $cc, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr $roundsw, [$cc, #240] + add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key + ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys + ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr + lsr $main_end_input_ptr, $bit_length, #3 // byte_len + mov $len, $main_end_input_ptr + ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32 + ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible + sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 + ldr $rk0q, [$cc, #0] // load rk0 + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr $rk7q, [$cc, #112] // load rk7 + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + lsr $rctr32x, $ctr96_t32x, #32 + fmov $ctr2d, $ctr96_b64x // CTR block 2 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + rev $rctr32w, $rctr32w // rev_ctr32 + fmov $ctr1d, $ctr96_b64x // CTR block 1 + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0 + add $rctr32w, $rctr32w, #1 // increment rev_ctr32 + rev $ctr32w, $rctr32w // CTR block 1 + fmov $ctr3d, $ctr96_b64x // CTR block 3 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1 + add $rctr32w, $rctr32w, #1 // CTR block 1 + ldr $rk1q, [$cc, #16] // load rk1 + fmov $ctr1.d[1], $ctr32x // CTR block 1 + rev $ctr32w, $rctr32w // CTR block 2 + add $rctr32w, $rctr32w, #1 // CTR block 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2 + ldr $rk2q, [$cc, #32] // load rk2 + fmov $ctr2.d[1], $ctr32x // CTR block 2 + rev $ctr32w, $rctr32w // CTR block 3 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3 + fmov $ctr3.d[1], $ctr32x // CTR block 3 + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0 + ldr $rk3q, [$cc, #48] // load rk3 + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2 + ldr $rk6q, [$cc, #96] // load rk6 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0 + ldr $rk5q, [$cc, #80] // load rk5 + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1 + ldr $h3q, [$Htable, #48] // load h3l | h3h + ext $h3b, $h3b, $h3b, #8 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0 + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1 + ldr $rk4q, [$cc, #64] // load rk4 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2 + ldr $h2q, [$Htable, #32] // load h2l | h2h + ext $h2b, $h2b, $h2b, #8 + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1 + ldr $rk12q, [$cc, #192] // load rk12 + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2 + ldr $h4q, [$Htable, #80] // load h4l | h4h + ext $h4b, $h4b, $h4b, #8 + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3 + ldr $rk11q, [$cc, #176] // load rk11 + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2 + ldr $rk8q, [$cc, #128] // load rk8 + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3 + add $rctr32w, $rctr32w, #1 // CTR block 3 + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // 
AES block 0 - round 3 + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4 + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4 + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4 + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4 + cmp $rounds, #12 // setup flags for AES-128/192/256 check + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5 + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5 + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5 + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6 + trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6 + ldr $rk9q, [$cc, #144] // load rk9 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6 + ldr $h1q, [$Htable] // load h1l | h1h + ext $h1b, $h1b, $h1b, #8 + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6 + ldr $rk10q, [$cc, #160] // load rk10 + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7 + trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7 + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7 + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7 + trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8 + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8 + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8 + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8 + b.lt .Lenc_finish_first_blocks // branch if AES-128 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9 + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10 + b.eq .Lenc_finish_first_blocks // branch if AES-192 + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11 + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12 + +.Lenc_finish_first_blocks: + cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks + eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k + aese $ctr2b, $rkNm1 // AES block 2 - round N-1 + trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h + aese $ctr1b, $rkNm1 // AES block 1 - round N-1 + aese $ctr0b, $rkNm1 // AES block 0 - round N-1 + aese $ctr3b, $rkNm1 // AES block 3 - round N-1 + eor $h12k.16b, 
$h12k.16b, $t0.16b // h2k | h1k + b.ge .Lenc_tail // handle tail + + ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 1 - load plaintext + rev $ctr32w, $rctr32w // CTR block 4 + ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 0 - load plaintext + ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 3 - load plaintext + ldp $input_l2, $input_h2, [$input_ptr, #32] // AES block 2 - load plaintext + add $input_ptr, $input_ptr, #64 // AES input_ptr update + eor $input_l1, $input_l1, $rkN_l // AES block 1 - round N low + eor $input_h1, $input_h1, $rkN_h // AES block 1 - round N high + fmov $ctr_t1d, $input_l1 // AES block 1 - mov low + eor $input_l0, $input_l0, $rkN_l // AES block 0 - round N low + eor $input_h0, $input_h0, $rkN_h // AES block 0 - round N high + eor $input_h3, $input_h3, $rkN_h // AES block 3 - round N high + fmov $ctr_t0d, $input_l0 // AES block 0 - mov low + cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks + fmov $ctr_t0.d[1], $input_h0 // AES block 0 - mov high + eor $input_l3, $input_l3, $rkN_l // AES block 3 - round N low + eor $input_l2, $input_l2, $rkN_l // AES block 2 - round N low + fmov $ctr_t1.d[1], $input_h1 // AES block 1 - mov high + fmov $ctr_t2d, $input_l2 // AES block 2 - mov low + add $rctr32w, $rctr32w, #1 // CTR block 4 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4 + fmov $ctr_t3d, $input_l3 // AES block 3 - mov low + eor $input_h2, $input_h2, $rkN_h // AES block 2 - round N high + fmov $ctr_t2.d[1], $input_h2 // AES block 2 - mov high + eor $res0b, $ctr_t0b, $ctr0b // AES block 0 - result + fmov $ctr0d, $ctr96_b64x // CTR block 4 + fmov $ctr0.d[1], $ctr32x // CTR block 4 + rev $ctr32w, $rctr32w // CTR block 5 + add $rctr32w, $rctr32w, #1 // CTR block 5 + eor $res1b, $ctr_t1b, $ctr1b // AES block 1 - result + fmov $ctr1d, $ctr96_b64x // CTR block 5 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5 + fmov $ctr1.d[1], $ctr32x // CTR block 5 + rev $ctr32w, $rctr32w // CTR block 6 + st1 { $res0b}, [$output_ptr], #16 // AES block 0 - store result + fmov $ctr_t3.d[1], $input_h3 // AES block 3 - mov high + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6 + eor $res2b, $ctr_t2b, $ctr2b // AES block 2 - result + st1 { $res1b}, [$output_ptr], #16 // AES block 1 - store result + add $rctr32w, $rctr32w, #1 // CTR block 6 + fmov $ctr2d, $ctr96_b64x // CTR block 6 + fmov $ctr2.d[1], $ctr32x // CTR block 6 + st1 { $res2b}, [$output_ptr], #16 // AES block 2 - store result + rev $ctr32w, $rctr32w // CTR block 7 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 7 + eor $res3b, $ctr_t3b, $ctr3b // AES block 3 - result + st1 { $res3b}, [$output_ptr], #16 // AES block 3 - store result + b.ge .Lenc_prepretail // do prepretail + +.Lenc_main_loop: // main loop start + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 + rev64 $res0b, $res0b // GHASH block 4k (only t0 is free) + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 + fmov $ctr3d, $ctr96_b64x // CTR block 4k+3 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 + ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 + fmov $ctr3.d[1], $ctr32x // CTR block 4k+3 + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 + ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 4k+7 - load plaintext + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 + ldp $input_l2, $input_h2, [$input_ptr, #32] 
// AES block 4k+6 - load plaintext + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 + eor $res0b, $res0b, $acc_lb // PRE 1 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 + eor $input_l3, $input_l3, $rkN_l // AES block 4k+7 - round N low + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 + mov $acc_md, $h34k.d[1] // GHASH block 4k - mid + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high + eor $input_h2, $input_h2, $rkN_h // AES block 4k+6 - round N high + mov $t0d, $res0.d[1] // GHASH block 4k - mid + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 + rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free) + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low + eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 + rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high + pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid + rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low + eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high + mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 + eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 + mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 + eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 + eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 + pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 + ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 + eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid + pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high + pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 + pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low + eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 + ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 4k+5 - load plaintext + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 + mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid + aese $ctr2b, $rk6 \n aesmc 
$ctr2b, $ctr2b // AES block 4k+6 - round 6 + eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low + pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid + pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high + eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 + eor $input_l1, $input_l1, $rkN_l // AES block 4k+5 - round N low + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 + eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 + eor $input_l2, $input_l2, $rkN_l // AES block 4k+6 - round N low + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 + movi $mod_constant.8b, #0xc2 + pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid + eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high + cmp $rounds, #12 // setup flags for AES-128/192/256 check + fmov $ctr_t1d, $input_l1 // AES block 4k+5 - mov low + ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 4k+4 - load plaintext + b.lt .Lenc_main_loop_continue // branch if AES-128 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 + b.eq .Lenc_main_loop_continue // branch if AES-192 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 + +.Lenc_main_loop_continue: + shl $mod_constantd, $mod_constantd, #56 // mod_constant + eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low + eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid + add $rctr32w, $rctr32w, #1 // CTR block 4k+3 + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + add $input_ptr, $input_ptr, #64 // AES input_ptr update + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + rev $ctr32w, $rctr32w // CTR block 4k+8 + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high + fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8 + eor $mod_t.16b, $acc_hb, $mod_t.16b // MODULO - fold into mid + eor $input_h1, $input_h1, $rkN_h // AES block 4k+5 - round N high + eor $input_h3, $input_h3, $rkN_h // AES block 4k+7 - round N high + add $rctr32w, $rctr32w, #1 // CTR block 4k+8 + aese $ctr0b, $rkNm1 // AES block 4k+4 - 
round N-1 + fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + fmov $ctr_t3d, $input_l3 // AES block 4k+7 - mov low + aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 + fmov $ctr_t1.d[1], $input_h1 // AES block 4k+5 - mov high + fmov $ctr_t2d, $input_l2 // AES block 4k+6 - mov low + cmp $input_ptr, $main_end_input_ptr // LOOP CONTROL + fmov $ctr_t2.d[1], $input_h2 // AES block 4k+6 - mov high + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + eor $res0b, $ctr_t0b, $ctr0b // AES block 4k+4 - result + fmov $ctr0d, $ctr96_b64x // CTR block 4k+8 + fmov $ctr0.d[1], $ctr32x // CTR block 4k+8 + rev $ctr32w, $rctr32w // CTR block 4k+9 + add $rctr32w, $rctr32w, #1 // CTR block 4k+9 + eor $res1b, $ctr_t1b, $ctr1b // AES block 4k+5 - result + fmov $ctr1d, $ctr96_b64x // CTR block 4k+9 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9 + fmov $ctr1.d[1], $ctr32x // CTR block 4k+9 + aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 + rev $ctr32w, $rctr32w // CTR block 4k+10 + st1 { $res0b}, [$output_ptr], #16 // AES block 4k+4 - store result + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10 + eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low + fmov $ctr_t3.d[1], $input_h3 // AES block 4k+7 - mov high + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + st1 { $res1b}, [$output_ptr], #16 // AES block 4k+5 - store result + add $rctr32w, $rctr32w, #1 // CTR block 4k+10 + aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 + eor $res2b, $ctr_t2b, $ctr2b // AES block 4k+6 - result + fmov $ctr2d, $ctr96_b64x // CTR block 4k+10 + st1 { $res2b}, [$output_ptr], #16 // AES block 4k+6 - store result + fmov $ctr2.d[1], $ctr32x // CTR block 4k+10 + rev $ctr32w, $rctr32w // CTR block 4k+11 + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+11 + eor $res3b, $ctr_t3b, $ctr3b // AES block 4k+7 - result + st1 { $res3b}, [$output_ptr], #16 // AES block 4k+7 - store result + b.lt .Lenc_main_loop + +.Lenc_prepretail: // PREPRETAIL + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 + rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free) + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 + fmov $ctr3d, $ctr96_b64x // CTR block 4k+3 + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 + rev64 $res0b, $res0b // GHASH block 4k (only t0 is free) + fmov $ctr3.d[1], $ctr32x // CTR block 4k+3 + ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 + eor $res0b, $res0b, $acc_lb // PRE 1 + rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free) + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 + mov $acc_md, $h34k.d[1] // GHASH block 4k - mid + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low + mov $t0d, $res0.d[1] // GHASH block 4k - mid + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 + eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES 
block 4k+4 - round 2 + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 + pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid + pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high + pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 + eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high + mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 + eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 + eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid + mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 + rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 + pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid + eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid + add $rctr32w, $rctr32w, #1 // CTR block 4k+3 + pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 + eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid + pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high + eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low + ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 + eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high + mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 + pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid + eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid + pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 + pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid + eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 + movi $mod_constant.8b, #0xc2 + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 + eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 + shl $mod_constantd, $mod_constantd, #56 // mod_constant + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 + eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid + pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 + cmp $rounds, #12 // setup flags for AES-128/192/256 check + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 + eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 + eor $acc_mb, $acc_mb, $acc_hb // karatsuba 
tidy up + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 + pmull $t1.1q, $acc_h.1d, $mod_constant.1d + ext $acc_hb, $acc_hb, $acc_hb, #8 + eor $acc_mb, $acc_mb, $acc_lb + b.lt .Lenc_finish_prepretail // branch if AES-128 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 + b.eq .Lenc_finish_prepretail // branch if AES-192 + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 + +.Lenc_finish_prepretail: + eor $acc_mb, $acc_mb, $t1.16b + eor $acc_mb, $acc_mb, $acc_hb + pmull $t1.1q, $acc_m.1d, $mod_constant.1d + ext $acc_mb, $acc_mb, $acc_mb, #8 + aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 + eor $acc_lb, $acc_lb, $t1.16b + aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 + aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 + aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 + eor $acc_lb, $acc_lb, $acc_mb + +.Lenc_tail: // TAIL + ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag + sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process + ldp $input_l0, $input_h0, [$input_ptr], #16 // AES block 4k+4 - load plaintext + eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low + eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high + cmp $main_end_input_ptr, #48 + fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low + fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high + eor $res1b, $ctr_t0b, $ctr0b // AES block 4k+4 - result + b.gt .Lenc_blocks_more_than_3 + cmp $main_end_input_ptr, #32 + mov $ctr3b, $ctr2b + movi $acc_l.8b, #0 + movi $acc_h.8b, #0 + sub $rctr32w, $rctr32w, #1 + mov $ctr2b, $ctr1b + movi $acc_m.8b, #0 + b.gt .Lenc_blocks_more_than_2 + mov $ctr3b, $ctr1b + sub $rctr32w, $rctr32w, #1 + cmp $main_end_input_ptr, #16 + b.gt .Lenc_blocks_more_than_1 + sub $rctr32w, $rctr32w, #1 + b .Lenc_blocks_less_than_1 +.Lenc_blocks_more_than_3: // blocks left > 3 + st1 { $res1b}, [$output_ptr], #16 // AES final-3 block - store result + ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-2 block - load input low & high + rev64 $res0b, $res1b // GHASH final-3 block + eor $input_l0, $input_l0, $rkN_l // AES final-2 block - round N low + eor $res0b, $res0b, $t0.16b // feed in partial tag + eor $input_h0, $input_h0, $rkN_h // AES final-2 block - round N high + mov $rk4d, $res0.d[1] // GHASH final-3 block - mid + fmov $res1d, $input_l0 // AES final-2 block - mov low + fmov $res1.d[1], $input_h0 // AES final-2 block - mov 
high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid + movi $t0.8b, #0 // suppress further partial tag feed in + mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid + eor $res1b, $res1b, $ctr1b // AES final-2 block - result +.Lenc_blocks_more_than_2: // blocks left > 2 + st1 { $res1b}, [$output_ptr], #16 // AES final-2 block - store result + ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-1 block - load input low & high + rev64 $res0b, $res1b // GHASH final-2 block + eor $input_l0, $input_l0, $rkN_l // AES final-1 block - round N low + eor $res0b, $res0b, $t0.16b // feed in partial tag + fmov $res1d, $input_l0 // AES final-1 block - mov low + eor $input_h0, $input_h0, $rkN_h // AES final-1 block - round N high + fmov $res1.d[1], $input_h0 // AES final-1 block - mov high + movi $t0.8b, #0 // suppress further partial tag feed in + pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high + mov $rk4d, $res0.d[1] // GHASH final-2 block - mid + pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid + eor $res1b, $res1b, $ctr2b // AES final-1 block - result + eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high + pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid + eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low + eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid +.Lenc_blocks_more_than_1: // blocks left > 1 + st1 { $res1b}, [$output_ptr], #16 // AES final-1 block - store result + rev64 $res0b, $res1b // GHASH final-1 block + ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final block - load input low & high + eor $res0b, $res0b, $t0.16b // feed in partial tag + movi $t0.8b, #0 // suppress further partial tag feed in + eor $input_l0, $input_l0, $rkN_l // AES final block - round N low + mov $rk4d, $res0.d[1] // GHASH final-1 block - mid + pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high + eor $input_h0, $input_h0, $rkN_h // AES final block - round N high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid + eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high + ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid + fmov $res1d, $input_l0 // AES final block - mov low + fmov $res1.d[1], $input_h0 // AES final block - mov high + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid + pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low + eor $res1b, $res1b, $ctr3b // AES final block - result + eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid + eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low +.Lenc_blocks_less_than_1: // blocks left <= 1 + and $bit_length, $bit_length, #127 // bit_length %= 128 + mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff + sub $bit_length, $bit_length, #128 // bit_length -= 128 + neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { $rk0}, [$output_ptr] // load existing bytes where the possibly partial last block is to be stored + mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff + and $bit_length, $bit_length, #127 // bit_length %= 128 + lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block + cmp $bit_length, #64 + csel $input_l0, $rkN_l, $rkN_h, lt + csel $input_h0, $rkN_h, xzr, lt + fmov $ctr0d, $input_l0 // ctr0b is mask 
for last block + fmov $ctr0.d[1], $input_h0 + and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits + rev64 $res0b, $res1b // GHASH final block + eor $res0b, $res0b, $t0.16b // feed in partial tag + bif $res1b, $rk0, $ctr0b // insert existing bytes in top end of result before storing + pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high + mov $t0d, $res0.d[1] // GHASH final block - mid + rev $ctr32w, $rctr32w + pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low + eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high + eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid + pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid + eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low + eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid + movi $mod_constant.8b, #0xc2 + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + shl $mod_constantd, $mod_constantd, #56 // mod_constant + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + str $ctr32w, [$counter, #12] // store the updated counter + st1 { $res1b}, [$output_ptr] // store all 16B + eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel +___ + +{ +my $t8="v4"; +my $t8d="d4"; +my $t9="v6"; +my $t9d="d6"; +################################################################################ +# size_t aes_gcm_dec_kernel(const uint8_t *in, +# size_t len_bits, +# uint8_t *out, +# u64 *Xi, +# uint8_t ivec[16], +# const void *key); +# +$code.=<<___; +.global aes_gcm_dec_kernel +.type aes_gcm_dec_kernel,%function +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov $counter, x4 + mov $cc, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr $roundsw, [$cc, #240] + add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key + ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys + ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys + lsr $main_end_input_ptr, $bit_length, #3 // byte_len + mov $len, $main_end_input_ptr + ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32 + ldr $rk8q, [$cc, #128] // load rk8 + sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 + ldr $rk7q, [$cc, #112] // load rk7 + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr + ldr $rk6q, [$cc, #96] // load rk6 + lsr $rctr32x, $ctr96_t32x, #32 + ldr $rk5q, [$cc, #80] // load rk5 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + ldr $rk3q, [$cc, #48] // load rk3 + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + rev $rctr32w, $rctr32w // rev_ctr32 + add $rctr32w, $rctr32w, #1 // increment rev_ctr32 + fmov $ctr3d, $ctr96_b64x // CTR block 3 + rev $ctr32w, $rctr32w // CTR block 1 + add $rctr32w, $rctr32w, #1 // CTR block 1 + fmov $ctr1d, $ctr96_b64x // CTR block 1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1 + ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov $ctr1.d[1], $ctr32x // CTR block 1 + rev $ctr32w, $rctr32w // CTR block 2 + add $rctr32w, $rctr32w, #1 // CTR block 2 + fmov $ctr2d, $ctr96_b64x // CTR block 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2 + fmov $ctr2.d[1], $ctr32x // CTR block 2 + rev $ctr32w, $rctr32w // CTR block 3 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3 + ldr $rk0q, [$cc, #0] // load rk0 + fmov $ctr3.d[1], $ctr32x // CTR block 3 + add $rctr32w, $rctr32w, #1 // CTR block 3 + ldr $rk4q, [$cc, #64] // load rk4 + ldr $rk1q, [$cc, #16] // load rk1 + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0 + ldr $h3q, [$Htable, #48] // load h3l | h3h + ext $h3b, $h3b, $h3b, #8 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0 + ldr $h4q, [$Htable, #80] // load h4l | h4h + ext $h4b, $h4b, $h4b, #8 + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0 + ldr $h2q, [$Htable, #32] // load h2l | h2h + ext $h2b, $h2b, $h2b, #8 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0 + ldr $rk2q, [$cc, #32] // load rk2 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1 + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1 + ldr $rk9q, [$cc, #144] // load rk9 + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1 + ldr $rk12q, [$cc, #192] // load rk12 + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2 + ldr $h1q, [$Htable] // load h1l | h1h + ext $h1b, $h1b, $h1b, #8 + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2 + ldr $rk10q, [$cc, #160] // load rk10 + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2 + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 
0 - round 3 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2 + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3 + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4 + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3 + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3 + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4 + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4 + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4 + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5 + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5 + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5 + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6 + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6 + cmp $rounds, #12 // setup flags for AES-128/192/256 check + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6 + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6 + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7 + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7 + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7 + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8 + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7 + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8 + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8 + ldr $rk11q, [$cc, #176] // load rk11 + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8 + b.lt .Ldec_finish_first_blocks // branch if AES-128 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9 + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10 + b.eq .Ldec_finish_first_blocks // branch if AES-192 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11 + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12 + +.Ldec_finish_first_blocks: + cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks + trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h + trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l + trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h + trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l + eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k + aese $ctr1b, $rkNm1 // AES block 1 - round N-1 + aese $ctr2b, $rkNm1 // AES block 2 - round N-1 + eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k + aese $ctr3b, $rkNm1 // AES block 3 - round N-1 + aese $ctr0b, 
$rkNm1 // AES block 0 - round N-1 + b.ge .Ldec_tail // handle tail + + ldr $res0q, [$input_ptr, #0] // AES block 0 - load ciphertext + ldr $res1q, [$input_ptr, #16] // AES block 1 - load ciphertext + rev $ctr32w, $rctr32w // CTR block 4 + eor $ctr0b, $res0b, $ctr0b // AES block 0 - result + eor $ctr1b, $res1b, $ctr1b // AES block 1 - result + rev64 $res1b, $res1b // GHASH block 1 + ldr $res3q, [$input_ptr, #48] // AES block 3 - load ciphertext + mov $output_h0, $ctr0.d[1] // AES block 0 - mov high + mov $output_l0, $ctr0.d[0] // AES block 0 - mov low + rev64 $res0b, $res0b // GHASH block 0 + add $rctr32w, $rctr32w, #1 // CTR block 4 + fmov $ctr0d, $ctr96_b64x // CTR block 4 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4 + fmov $ctr0.d[1], $ctr32x // CTR block 4 + rev $ctr32w, $rctr32w // CTR block 5 + add $rctr32w, $rctr32w, #1 // CTR block 5 + mov $output_l1, $ctr1.d[0] // AES block 1 - mov low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5 + mov $output_h1, $ctr1.d[1] // AES block 1 - mov high + eor $output_h0, $output_h0, $rkN_h // AES block 0 - round N high + eor $output_l0, $output_l0, $rkN_l // AES block 0 - round N low + stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 0 - store result + fmov $ctr1d, $ctr96_b64x // CTR block 5 + ldr $res2q, [$input_ptr, #32] // AES block 2 - load ciphertext + add $input_ptr, $input_ptr, #64 // AES input_ptr update + fmov $ctr1.d[1], $ctr32x // CTR block 5 + rev $ctr32w, $rctr32w // CTR block 6 + add $rctr32w, $rctr32w, #1 // CTR block 6 + eor $output_l1, $output_l1, $rkN_l // AES block 1 - round N low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6 + eor $output_h1, $output_h1, $rkN_h // AES block 1 - round N high + stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 1 - store result + eor $ctr2b, $res2b, $ctr2b // AES block 2 - result + cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks + b.ge .Ldec_prepretail // do prepretail + +.Ldec_main_loop: // main loop start + mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low + ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 + eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 + mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 + fmov $ctr2d, $ctr96_b64x // CTR block 4k+6 + fmov $ctr2.d[1], $ctr32x // CTR block 4k+6 + eor $res0b, $res0b, $acc_lb // PRE 1 + rev $ctr32w, $rctr32w // CTR block 4k+7 + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 + mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 + mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high + mov $t0d, $res0.d[1] // GHASH block 4k - mid + fmov $ctr3d, $ctr96_b64x // CTR block 4k+7 + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 + fmov $ctr3.d[1], $ctr32x // CTR block 4k+7 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 + eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 + eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 + 
mov $acc_md, $h34k.d[1] // GHASH block 4k - mid + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 + rev64 $res2b, $res2b // GHASH block 4k+2 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 + eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 + stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low + pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 + rev64 $res3b, $res3b // GHASH block 4k+3 + pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid + eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low + pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low + eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high + eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 + mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 + eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 + add $rctr32w, $rctr32w, #1 // CTR block 4k+7 + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 + mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 + eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid + pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 + eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 + eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low + pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid + rev $ctr32w, $rctr32w // CTR block 4k+8 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 + ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 + add $rctr32w, $rctr32w, #1 // CTR block 4k+8 + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 + eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 + pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high + mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 + pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 + eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 + pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8 + eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid + pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high + cmp $rounds, #12 // setup flags for AES-128/192/256 check + eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid + aese $ctr1b, $rk8 \n aesmc $ctr1b, 
$ctr1b // AES block 4k+5 - round 8 + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 + eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high + pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid + movi $mod_constant.8b, #0xc2 + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 + eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 + shl $mod_constantd, $mod_constantd, #56 // mod_constant + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 + eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 + b.lt .Ldec_main_loop_continue // branch if AES-128 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 + b.eq .Ldec_main_loop_continue // branch if AES-192 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 + +.Ldec_main_loop_continue: + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + ldr $res0q, [$input_ptr, #0] // AES block 4k+4 - load ciphertext + aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + ldr $res1q, [$input_ptr, #16] // AES block 4k+5 - load ciphertext + eor $ctr0b, $res0b, $ctr0b // AES block 4k+4 - result + stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + ldr $res3q, [$input_ptr, #48] // AES block 4k+7 - load ciphertext + ldr $res2q, [$input_ptr, #32] // AES block 4k+6 - load ciphertext + mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high + eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid + aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 + add $input_ptr, $input_ptr, #64 // AES input_ptr update + mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low + fmov $ctr0d, $ctr96_b64x // CTR block 4k+8 + fmov $ctr0.d[1], $ctr32x // CTR block 4k+8 + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + eor $ctr1b, $res1b, $ctr1b // AES block 4k+5 - result + rev $ctr32w, $rctr32w // CTR block 4k+9 + aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9 + cmp $input_ptr, $main_end_input_ptr // LOOP 
CONTROL + add $rctr32w, $rctr32w, #1 // CTR block 4k+9 + eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low + eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high + mov $output_h1, $ctr1.d[1] // AES block 4k+5 - mov high + eor $ctr2b, $res2b, $ctr2b // AES block 4k+6 - result + eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low + mov $output_l1, $ctr1.d[0] // AES block 4k+5 - mov low + fmov $ctr1d, $ctr96_b64x // CTR block 4k+9 + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + fmov $ctr1.d[1], $ctr32x // CTR block 4k+9 + rev $ctr32w, $rctr32w // CTR block 4k+10 + add $rctr32w, $rctr32w, #1 // CTR block 4k+10 + aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10 + rev64 $res1b, $res1b // GHASH block 4k+5 + eor $output_h1, $output_h1, $rkN_h // AES block 4k+5 - round N high + stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 4k+4 - store result + eor $output_l1, $output_l1, $rkN_l // AES block 4k+5 - round N low + stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 4k+5 - store result + rev64 $res0b, $res0b // GHASH block 4k+4 + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + b.lt .Ldec_main_loop + +.Ldec_prepretail: // PREPRETAIL + ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 + mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low + eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 + mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 + fmov $ctr2d, $ctr96_b64x // CTR block 4k+6 + fmov $ctr2.d[1], $ctr32x // CTR block 4k+6 + rev $ctr32w, $rctr32w // CTR block 4k+7 + eor $res0b, $res0b, $acc_lb // PRE 1 + rev64 $res2b, $res2b // GHASH block 4k+2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7 + mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 + mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low + mov $t0d, $res0.d[1] // GHASH block 4k - mid + fmov $ctr3d, $ctr96_b64x // CTR block 4k+7 + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high + fmov $ctr3.d[1], $ctr32x // CTR block 4k+7 + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 + mov $acc_md, $h34k.d[1] // GHASH block 4k - mid + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 + eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid + pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 + rev64 $res3b, $res3b // GHASH block 4k+3 + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 + pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid + eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high + pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 + mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 + eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 + mov $t6d, $res2.d[1] // 
GHASH block 4k+2 - mid + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 + eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid + pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 + eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid + pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 + eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 + pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high + eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid + pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 + ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 + eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high + pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 + mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 + pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 + eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 + eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 + movi $mod_constant.8b, #0xc2 + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 + eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low + pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 + cmp $rounds, #12 // setup flags for AES-128/192/256 check + eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 + eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 + shl $mod_constantd, $mod_constantd, #56 // mod_constant + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 + b.lt .Ldec_finish_prepretail // branch if AES-128 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 + 
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 + b.eq .Ldec_finish_prepretail // branch if AES-192 + + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 + +.Ldec_finish_prepretail: + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high + eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low + eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid + add $rctr32w, $rctr32w, #1 // CTR block 4k+7 + eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high + stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result + + eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low + aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 + aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 + aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 + aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + +.Ldec_tail: // TAIL + sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process + ld1 { $res1b}, [$input_ptr], #16 // AES block 4k+4 - load ciphertext + eor $ctr0b, $res1b, $ctr0b // AES block 4k+4 - result + mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low + mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high + ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag + cmp $main_end_input_ptr, #48 + eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low + eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high + b.gt .Ldec_blocks_more_than_3 + sub $rctr32w, $rctr32w, #1 + mov $ctr3b, $ctr2b + movi $acc_m.8b, #0 + movi $acc_l.8b, #0 + cmp $main_end_input_ptr, #32 + movi $acc_h.8b, #0 + mov $ctr2b, $ctr1b + b.gt .Ldec_blocks_more_than_2 + sub $rctr32w, $rctr32w, #1 + mov $ctr3b, $ctr1b + cmp $main_end_input_ptr, #16 + b.gt .Ldec_blocks_more_than_1 + sub $rctr32w, $rctr32w, #1 + b .Ldec_blocks_less_than_1 +.Ldec_blocks_more_than_3: // blocks left > 3 + rev64 $res0b, $res1b // GHASH final-3 block + ld1 { $res1b}, [$input_ptr], #16 // AES final-2 block - load ciphertext + stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-3 block - store result + mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid + eor $res0b, $res0b, $t0.16b // feed in partial tag + eor $ctr0b, $res1b, $ctr1b // AES final-2 block - 
result + mov $rk4d, $res0.d[1] // GHASH final-3 block - mid + mov $output_l0, $ctr0.d[0] // AES final-2 block - mov low + mov $output_h0, $ctr0.d[1] // AES final-2 block - mov high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid + movi $t0.8b, #0 // suppress further partial tag feed in + pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid + eor $output_l0, $output_l0, $rkN_l // AES final-2 block - round N low + pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low + eor $output_h0, $output_h0, $rkN_h // AES final-2 block - round N high +.Ldec_blocks_more_than_2: // blocks left > 2 + rev64 $res0b, $res1b // GHASH final-2 block + ld1 { $res1b}, [$input_ptr], #16 // AES final-1 block - load ciphertext + eor $res0b, $res0b, $t0.16b // feed in partial tag + stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-2 block - store result + eor $ctr0b, $res1b, $ctr2b // AES final-1 block - result + mov $rk4d, $res0.d[1] // GHASH final-2 block - mid + pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low + pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid + mov $output_l0, $ctr0.d[0] // AES final-1 block - mov low + mov $output_h0, $ctr0.d[1] // AES final-1 block - mov high + eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low + movi $t0.8b, #0 // suppress further partial tag feed in + pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid + eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high + eor $output_l0, $output_l0, $rkN_l // AES final-1 block - round N low + eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid + eor $output_h0, $output_h0, $rkN_h // AES final-1 block - round N high +.Ldec_blocks_more_than_1: // blocks left > 1 + stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-1 block - store result + rev64 $res0b, $res1b // GHASH final-1 block + ld1 { $res1b}, [$input_ptr], #16 // AES final block - load ciphertext + eor $res0b, $res0b, $t0.16b // feed in partial tag + movi $t0.8b, #0 // suppress further partial tag feed in + mov $rk4d, $res0.d[1] // GHASH final-1 block - mid + eor $ctr0b, $res1b, $ctr3b // AES final block - result + pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high + eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid + pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low + mov $output_l0, $ctr0.d[0] // AES final block - mov low + ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid + mov $output_h0, $ctr0.d[1] // AES final block - mov high + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid + eor $output_l0, $output_l0, $rkN_l // AES final block - round N low + eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low + eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high + eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid + eor $output_h0, $output_h0, $rkN_h // AES final block - round N high +.Ldec_blocks_less_than_1: // blocks left <= 1 + and $bit_length, $bit_length, #127 // bit_length %= 128 + mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff + sub $bit_length, $bit_length, #128 // bit_length -= 128 + mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff + ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] // load existing bytes we need to not overwrite + neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) + and $bit_length, $bit_length, #127 // bit_length %= 
128 + lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block + cmp $bit_length, #64 + csel $ctr32x, $rkN_l, $rkN_h, lt + csel $ctr96_b64x, $rkN_h, xzr, lt + fmov $ctr0d, $ctr32x // ctr0b is mask for last block + and $output_l0, $output_l0, $ctr32x + mov $ctr0.d[1], $ctr96_b64x + bic $end_input_ptr, $end_input_ptr, $ctr32x // mask out low existing bytes + rev $ctr32w, $rctr32w + bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x // mask out high existing bytes + orr $output_l0, $output_l0, $end_input_ptr + and $output_h0, $output_h0, $ctr96_b64x + orr $output_h0, $output_h0, $main_end_input_ptr + and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits + rev64 $res0b, $res1b // GHASH final block + eor $res0b, $res0b, $t0.16b // feed in partial tag + pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low + mov $t0d, $res0.d[1] // GHASH final block - mid + eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid + pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high + pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid + eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high + eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low + eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid + movi $mod_constant.8b, #0xc2 + eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up + shl $mod_constantd, $mod_constantd, #56 // mod_constant + eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid + ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment + eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid + eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low + ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment + eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low + stp $output_l0, $output_h0, [$output_ptr] + str $ctr32w, [$counter, #12] // store the updated counter + eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel +___ +} +} + +$code.=<<___; +#endif +___ + +print $code; +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/bsaes-armv7.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/bsaes-armv7.pl new file mode 100644 index 0000000000..a7518f6e63 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/bsaes-armv7.pl @@ -0,0 +1,1144 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# +# Specific modes and adaptation for Linux kernel by Ard Biesheuvel +# of Linaro. +# ==================================================================== + +# Bit-sliced AES for ARM NEON +# +# February 2012. +# +# This implementation is direct adaptation of bsaes-x86_64 module for +# ARM NEON. Except that this module is endian-neutral [in sense that +# it can be compiled for either endianness] by courtesy of vld1.8's +# neutrality. Initial version doesn't implement interface to OpenSSL, +# only low-level primitives and unsupported entry points, just enough +# to collect performance results, which for Cortex-A8 core are: +# +# encrypt 19.5 cycles per byte processed with 128-bit key +# decrypt 22.1 cycles per byte processed with 128-bit key +# key conv. 440 cycles per 128-bit key/0.18 of 8x block +# +# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +# which is [much] worse than anticipated (for further details see +# http://www.openssl.org/~appro/Snapdragon-S4.html). +# +# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +# manages in 20.0 cycles]. +# +# When comparing to x86_64 results keep in mind that NEON unit is +# [mostly] single-issue and thus can't [fully] benefit from +# instruction-level parallelism. And when comparing to aes-armv4 +# results keep in mind key schedule conversion overhead (see +# bsaes-x86_64.pl for further details)... +# +# + +# April-August 2013 +# Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); +my @XMM=map("q$_",(0..15)); + +{ +my ($key,$rounds,$const)=("r4","r5","r6"); + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub Sbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InBasisChange (@b); + &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); + &OutBasisChange (@b[7,1,4,2,6,5,0,3]); +} + +sub InBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +my @b=@_[0..7]; +$code.=<<___; + veor @b[2], @b[2], @b[1] + veor @b[5], @b[5], @b[6] + veor @b[3], @b[3], @b[0] + veor @b[6], @b[6], @b[2] + veor @b[5], @b[5], @b[0] + + veor @b[6], @b[6], @b[3] + veor @b[3], @b[3], @b[7] + veor @b[7], @b[7], @b[5] + veor @b[3], @b[3], @b[4] + veor @b[4], @b[4], @b[5] + + veor @b[2], @b[2], @b[7] + veor @b[3], @b[3], @b[1] + veor @b[1], @b[1], @b[5] +___ +} + +sub OutBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb +my @b=@_[0..7]; +$code.=<<___; + veor @b[0], @b[0], @b[6] + veor @b[1], @b[1], 
@b[4] + veor @b[4], @b[4], @b[6] + veor @b[2], @b[2], @b[0] + veor @b[6], @b[6], @b[1] + + veor @b[1], @b[1], @b[5] + veor @b[5], @b[5], @b[3] + veor @b[3], @b[3], @b[7] + veor @b[7], @b[7], @b[5] + veor @b[2], @b[2], @b[5] + + veor @b[4], @b[4], @b[7] +___ +} + +sub InvSbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InvInBasisChange (@b); + &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); + &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); +} + +sub InvInBasisChange { # OutBasisChange in reverse (with twist) +my @b=@_[5,1,2,6,3,7,0,4]; +$code.=<<___ + veor @b[1], @b[1], @b[7] + veor @b[4], @b[4], @b[7] + + veor @b[7], @b[7], @b[5] + veor @b[1], @b[1], @b[3] + veor @b[2], @b[2], @b[5] + veor @b[3], @b[3], @b[7] + + veor @b[6], @b[6], @b[1] + veor @b[2], @b[2], @b[0] + veor @b[5], @b[5], @b[3] + veor @b[4], @b[4], @b[6] + veor @b[0], @b[0], @b[6] + veor @b[1], @b[1], @b[4] +___ +} + +sub InvOutBasisChange { # InBasisChange in reverse +my @b=@_[2,5,7,3,6,1,0,4]; +$code.=<<___; + veor @b[1], @b[1], @b[5] + veor @b[2], @b[2], @b[7] + + veor @b[3], @b[3], @b[1] + veor @b[4], @b[4], @b[5] + veor @b[7], @b[7], @b[5] + veor @b[3], @b[3], @b[4] + veor @b[5], @b[5], @b[0] + veor @b[3], @b[3], @b[7] + veor @b[6], @b[6], @b[2] + veor @b[2], @b[2], @b[1] + veor @b[6], @b[6], @b[3] + + veor @b[3], @b[3], @b[0] + veor @b[5], @b[5], @b[6] +___ +} + +sub Mul_GF4 { +#;************************************************************* +#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * +#;************************************************************* +my ($x0,$x1,$y0,$y1,$t0,$t1)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + vand $t0, $t0, $x0 + veor $x0, $x0, $x1 + vand $t1, $x1, $y0 + vand $x0, $x0, $y1 + veor $x1, $t1, $t0 + veor $x0, $x0, $t1 +___ +} + +sub Mul_GF4_N { # not used, see next subroutine +# multiply and scale by N +my ($x0,$x1,$y0,$y1,$t0)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + vand $t0, $t0, $x0 + veor $x0, $x0, $x1 + vand $x1, $x1, $y0 + vand $x0, $x0, $y1 + veor $x1, $x1, $x0 + veor $x0, $x0, $t0 +___ +} + +sub Mul_GF4_N_GF4 { +# interleaved Mul_GF4_N and Mul_GF4 +my ($x0,$x1,$y0,$y1,$t0, + $x2,$x3,$y2,$y3,$t1)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + veor $t1, $y2, $y3 + vand $t0, $t0, $x0 + vand $t1, $t1, $x2 + veor $x0, $x0, $x1 + veor $x2, $x2, $x3 + vand $x1, $x1, $y0 + vand $x3, $x3, $y2 + vand $x0, $x0, $y1 + vand $x2, $x2, $y3 + veor $x1, $x1, $x0 + veor $x2, $x2, $x3 + veor $x0, $x0, $t0 + veor $x3, $x3, $t1 +___ +} +sub Mul_GF16_2 { +my @x=@_[0..7]; +my @y=@_[8..11]; +my @t=@_[12..15]; +$code.=<<___; + veor @t[0], @x[0], @x[2] + veor @t[1], @x[1], @x[3] +___ + &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]); +$code.=<<___; + veor @y[0], @y[0], @y[2] + veor @y[1], @y[1], @y[3] +___ + Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[2], @x[3], @y[2], @y[3], @t[2]); +$code.=<<___; + veor @x[0], @x[0], @t[0] + veor @x[2], @x[2], @t[0] + veor @x[1], @x[1], @t[1] + veor @x[3], @x[3], @t[1] + + veor @t[0], @x[4], @x[6] + veor @t[1], @x[5], @x[7] +___ + &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[6], @x[7], @y[2], @y[3], @t[2]); +$code.=<<___; + veor @y[0], @y[0], @y[2] + veor @y[1], @y[1], @y[3] +___ + &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]); +$code.=<<___; + veor @x[4], @x[4], @t[0] + veor @x[6], @x[6], @t[0] + veor @x[5], @x[5], @t[1] + veor @x[7], @x[7], @t[1] +___ +} +sub Inv_GF256 { 
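+# Note: this routine inverts every byte lane of the eight bit-sliced states
+# at once; GF(2^8) is handled through its tower-field view over GF(2^4) and
+# GF(2^2), which is why it is assembled from the Mul_GF4/Mul_GF16_2 helpers
+# above rather than from a lookup table.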
+#;******************************************************************** +#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * +#;******************************************************************** +my @x=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; +# direct optimizations from hardware +$code.=<<___; + veor @t[3], @x[4], @x[6] + veor @t[2], @x[5], @x[7] + veor @t[1], @x[1], @x[3] + veor @s[1], @x[7], @x[6] + vmov @t[0], @t[2] + veor @s[0], @x[0], @x[2] + + vorr @t[2], @t[2], @t[1] + veor @s[3], @t[3], @t[0] + vand @s[2], @t[3], @s[0] + vorr @t[3], @t[3], @s[0] + veor @s[0], @s[0], @t[1] + vand @t[0], @t[0], @t[1] + veor @t[1], @x[3], @x[2] + vand @s[3], @s[3], @s[0] + vand @s[1], @s[1], @t[1] + veor @t[1], @x[4], @x[5] + veor @s[0], @x[1], @x[0] + veor @t[3], @t[3], @s[1] + veor @t[2], @t[2], @s[1] + vand @s[1], @t[1], @s[0] + vorr @t[1], @t[1], @s[0] + veor @t[3], @t[3], @s[3] + veor @t[0], @t[0], @s[1] + veor @t[2], @t[2], @s[2] + veor @t[1], @t[1], @s[3] + veor @t[0], @t[0], @s[2] + vand @s[0], @x[7], @x[3] + veor @t[1], @t[1], @s[2] + vand @s[1], @x[6], @x[2] + vand @s[2], @x[5], @x[1] + vorr @s[3], @x[4], @x[0] + veor @t[3], @t[3], @s[0] + veor @t[1], @t[1], @s[2] + veor @t[0], @t[0], @s[3] + veor @t[2], @t[2], @s[1] + + @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 + + @ new smaller inversion + + vand @s[2], @t[3], @t[1] + vmov @s[0], @t[0] + + veor @s[1], @t[2], @s[2] + veor @s[3], @t[0], @s[2] + veor @s[2], @t[0], @s[2] @ @s[2]=@s[3] + + vbsl @s[1], @t[1], @t[0] + vbsl @s[3], @t[3], @t[2] + veor @t[3], @t[3], @t[2] + + vbsl @s[0], @s[1], @s[2] + vbsl @t[0], @s[2], @s[1] + + vand @s[2], @s[0], @s[3] + veor @t[1], @t[1], @t[0] + + veor @s[2], @s[2], @t[3] +___ +# output in s3, s2, s1, t1 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); + +### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb +} + +# AES linear components + +sub ShiftRows { +my @x=@_[0..7]; +my @t=@_[8..11]; +my $mask=pop; +$code.=<<___; + vldmia $key!, {@t[0]-@t[3]} + veor @t[0], @t[0], @x[0] + veor @t[1], @t[1], @x[1] + vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)` + vldmia $key!, {@t[0]} + veor @t[2], @t[2], @x[2] + vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)` + vldmia $key!, {@t[1]} + veor @t[3], @t[3], @x[3] + vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)` + vldmia $key!, {@t[2]} + vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)` + vldmia $key!, {@t[3]} + veor @t[0], @t[0], @x[4] + veor @t[1], @t[1], @x[5] + vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)` + veor @t[2], @t[2], @x[6] + vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)` + veor @t[3], @t[3], @x[7] + vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)` + vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)` +___ +} + +sub MixColumns { +# modified to emit output in order suitable for feeding back to aesenc[last] +my @x=@_[0..7]; +my @t=@_[8..15]; +my $inv=@_[16]; # optional +$code.=<<___; + vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32 + vext.8 @t[1], @x[1], @x[1], #12 + veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32) + 
vext.8 @t[2], @x[2], @x[2], #12 + veor @x[1], @x[1], @t[1] + vext.8 @t[3], @x[3], @x[3], #12 + veor @x[2], @x[2], @t[2] + vext.8 @t[4], @x[4], @x[4], #12 + veor @x[3], @x[3], @t[3] + vext.8 @t[5], @x[5], @x[5], #12 + veor @x[4], @x[4], @t[4] + vext.8 @t[6], @x[6], @x[6], #12 + veor @x[5], @x[5], @t[5] + vext.8 @t[7], @x[7], @x[7], #12 + veor @x[6], @x[6], @t[6] + + veor @t[1], @t[1], @x[0] + veor @x[7], @x[7], @t[7] + vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor @t[2], @t[2], @x[1] + veor @t[0], @t[0], @x[7] + veor @t[1], @t[1], @x[7] + vext.8 @x[1], @x[1], @x[1], #8 + veor @t[5], @t[5], @x[4] + veor @x[0], @x[0], @t[0] + veor @t[6], @t[6], @x[5] + veor @x[1], @x[1], @t[1] + vext.8 @t[0], @x[4], @x[4], #8 + veor @t[4], @t[4], @x[3] + vext.8 @t[1], @x[5], @x[5], #8 + veor @t[7], @t[7], @x[6] + vext.8 @x[4], @x[3], @x[3], #8 + veor @t[3], @t[3], @x[2] + vext.8 @x[5], @x[7], @x[7], #8 + veor @t[4], @t[4], @x[7] + vext.8 @x[3], @x[6], @x[6], #8 + veor @t[3], @t[3], @x[7] + vext.8 @x[6], @x[2], @x[2], #8 + veor @x[7], @t[1], @t[5] +___ +$code.=<<___ if (!$inv); + veor @x[2], @t[0], @t[4] + veor @x[4], @x[4], @t[3] + veor @x[5], @x[5], @t[7] + veor @x[3], @x[3], @t[6] + @ vmov @x[2], @t[0] + veor @x[6], @x[6], @t[2] + @ vmov @x[7], @t[1] +___ +$code.=<<___ if ($inv); + veor @t[3], @t[3], @x[4] + veor @x[5], @x[5], @t[7] + veor @x[2], @x[3], @t[6] + veor @x[3], @t[0], @t[4] + veor @x[4], @x[6], @t[2] + vmov @x[6], @t[3] + @ vmov @x[7], @t[1] +___ +} + +sub InvMixColumns_orig { +my @x=@_[0..7]; +my @t=@_[8..15]; + +$code.=<<___; + @ multiplication by 0x0e + vext.8 @t[7], @x[7], @x[7], #12 + vmov @t[2], @x[2] + veor @x[2], @x[2], @x[5] @ 2 5 + veor @x[7], @x[7], @x[5] @ 7 5 + vext.8 @t[0], @x[0], @x[0], #12 + vmov @t[5], @x[5] + veor @x[5], @x[5], @x[0] @ 5 0 [1] + veor @x[0], @x[0], @x[1] @ 0 1 + vext.8 @t[1], @x[1], @x[1], #12 + veor @x[1], @x[1], @x[2] @ 1 25 + veor @x[0], @x[0], @x[6] @ 01 6 [2] + vext.8 @t[3], @x[3], @x[3], #12 + veor @x[1], @x[1], @x[3] @ 125 3 [4] + veor @x[2], @x[2], @x[0] @ 25 016 [3] + veor @x[3], @x[3], @x[7] @ 3 75 + veor @x[7], @x[7], @x[6] @ 75 6 [0] + vext.8 @t[6], @x[6], @x[6], #12 + vmov @t[4], @x[4] + veor @x[6], @x[6], @x[4] @ 6 4 + veor @x[4], @x[4], @x[3] @ 4 375 [6] + veor @x[3], @x[3], @x[7] @ 375 756=36 + veor @x[6], @x[6], @t[5] @ 64 5 [7] + veor @x[3], @x[3], @t[2] @ 36 2 + vext.8 @t[5], @t[5], @t[5], #12 + veor @x[3], @x[3], @t[4] @ 362 4 [5] +___ + my @y = @x[7,5,0,2,1,3,4,6]; +$code.=<<___; + @ multiplication by 0x0b + veor @y[1], @y[1], @y[0] + veor @y[0], @y[0], @t[0] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[1], @y[1], @t[1] + veor @y[0], @y[0], @t[5] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[1], @y[1], @t[6] + veor @y[0], @y[0], @t[7] + veor @t[7], @t[7], @t[6] @ clobber t[7] + + veor @y[3], @y[3], @t[0] + veor @y[1], @y[1], @y[0] + vext.8 @t[0], @t[0], @t[0], #12 + veor @y[2], @y[2], @t[1] + veor @y[4], @y[4], @t[1] + vext.8 @t[1], @t[1], @t[1], #12 + veor @y[2], @y[2], @t[2] + veor @y[3], @y[3], @t[2] + veor @y[5], @y[5], @t[2] + veor @y[2], @y[2], @t[7] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[3], @y[3], @t[3] + veor @y[6], @y[6], @t[3] + veor @y[4], @y[4], @t[3] + veor @y[7], @y[7], @t[4] + vext.8 @t[3], @t[3], @t[3], #12 + veor @y[5], @y[5], @t[4] + veor @y[7], @y[7], @t[7] + veor @t[7], @t[7], @t[5] @ clobber t[7] even more + veor @y[3], @y[3], @t[5] + veor @y[4], @y[4], @t[4] + + veor @y[5], @y[5], @t[7] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[6], @y[6], @t[7] + veor @y[4], @y[4], @t[7] + + veor @t[7], @t[7], 
@t[5] + vext.8 @t[5], @t[5], @t[5], #12 + + @ multiplication by 0x0d + veor @y[4], @y[4], @y[7] + veor @t[7], @t[7], @t[6] @ restore t[7] + veor @y[7], @y[7], @t[4] + vext.8 @t[6], @t[6], @t[6], #12 + veor @y[2], @y[2], @t[0] + veor @y[7], @y[7], @t[5] + vext.8 @t[7], @t[7], @t[7], #12 + veor @y[2], @y[2], @t[2] + + veor @y[3], @y[3], @y[1] + veor @y[1], @y[1], @t[1] + veor @y[0], @y[0], @t[0] + veor @y[3], @y[3], @t[0] + veor @y[1], @y[1], @t[5] + veor @y[0], @y[0], @t[5] + vext.8 @t[0], @t[0], @t[0], #12 + veor @y[1], @y[1], @t[7] + veor @y[0], @y[0], @t[6] + veor @y[3], @y[3], @y[1] + veor @y[4], @y[4], @t[1] + vext.8 @t[1], @t[1], @t[1], #12 + + veor @y[7], @y[7], @t[7] + veor @y[4], @y[4], @t[2] + veor @y[5], @y[5], @t[2] + veor @y[2], @y[2], @t[6] + veor @t[6], @t[6], @t[3] @ clobber t[6] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[4], @y[4], @y[7] + veor @y[3], @y[3], @t[6] + + veor @y[6], @y[6], @t[6] + veor @y[5], @y[5], @t[5] + vext.8 @t[5], @t[5], @t[5], #12 + veor @y[6], @y[6], @t[4] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[5], @y[5], @t[6] + veor @y[6], @y[6], @t[7] + vext.8 @t[7], @t[7], @t[7], #12 + veor @t[6], @t[6], @t[3] @ restore t[6] + vext.8 @t[3], @t[3], @t[3], #12 + + @ multiplication by 0x09 + veor @y[4], @y[4], @y[1] + veor @t[1], @t[1], @y[1] @ t[1]=y[1] + veor @t[0], @t[0], @t[5] @ clobber t[0] + vext.8 @t[6], @t[6], @t[6], #12 + veor @t[1], @t[1], @t[5] + veor @y[3], @y[3], @t[0] + veor @t[0], @t[0], @y[0] @ t[0]=y[0] + veor @t[1], @t[1], @t[6] + veor @t[6], @t[6], @t[7] @ clobber t[6] + veor @y[4], @y[4], @t[1] + veor @y[7], @y[7], @t[4] + veor @y[6], @y[6], @t[3] + veor @y[5], @y[5], @t[2] + veor @t[4], @t[4], @y[4] @ t[4]=y[4] + veor @t[3], @t[3], @y[3] @ t[3]=y[3] + veor @t[5], @t[5], @y[5] @ t[5]=y[5] + veor @t[2], @t[2], @y[2] @ t[2]=y[2] + veor @t[3], @t[3], @t[7] + veor @XMM[5], @t[5], @t[6] + veor @XMM[6], @t[6], @y[6] @ t[6]=y[6] + veor @XMM[2], @t[2], @t[6] + veor @XMM[7], @t[7], @y[7] @ t[7]=y[7] + + vmov @XMM[0], @t[0] + vmov @XMM[1], @t[1] + @ vmov @XMM[2], @t[2] + vmov @XMM[3], @t[3] + vmov @XMM[4], @t[4] + @ vmov @XMM[5], @t[5] + @ vmov @XMM[6], @t[6] + @ vmov @XMM[7], @t[7] +___ +} + +sub InvMixColumns { +my @x=@_[0..7]; +my @t=@_[8..15]; + +# Thanks to Jussi Kivilinna for providing pointer to +# +# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | +# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | +# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | +# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | + +$code.=<<___; + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 @t[0], @x[0], @x[0], #8 + vext.8 @t[6], @x[6], @x[6], #8 + vext.8 @t[7], @x[7], @x[7], #8 + veor @t[0], @t[0], @x[0] + vext.8 @t[1], @x[1], @x[1], #8 + veor @t[6], @t[6], @x[6] + vext.8 @t[2], @x[2], @x[2], #8 + veor @t[7], @t[7], @x[7] + vext.8 @t[3], @x[3], @x[3], #8 + veor @t[1], @t[1], @x[1] + vext.8 @t[4], @x[4], @x[4], #8 + veor @t[2], @t[2], @x[2] + vext.8 @t[5], @x[5], @x[5], #8 + veor @t[3], @t[3], @x[3] + veor @t[4], @t[4], @x[4] + veor @t[5], @t[5], @x[5] + + veor @x[0], @x[0], @t[6] + veor @x[1], @x[1], @t[6] + veor @x[2], @x[2], @t[0] + veor @x[4], @x[4], @t[2] + veor @x[3], @x[3], @t[1] + veor @x[1], @x[1], @t[7] + veor @x[2], @x[2], @t[7] + veor @x[4], @x[4], @t[6] + veor @x[5], @x[5], @t[3] + veor @x[3], @x[3], @t[6] + veor @x[6], @x[6], @t[4] + veor @x[4], @x[4], @t[7] + veor @x[5], @x[5], @t[7] + veor @x[7], @x[7], @t[5] +___ + &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 +} + +sub swapmove { +my ($a,$b,$n,$mask,$t)=@_; +$code.=<<___; + vshr.u64 $t, $b, #$n 
+ veor $t, $t, $a + vand $t, $t, $mask + veor $a, $a, $t + vshl.u64 $t, $t, #$n + veor $b, $b, $t +___ +} +sub swapmove2x { +my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; +$code.=<<___; + vshr.u64 $t0, $b0, #$n + vshr.u64 $t1, $b1, #$n + veor $t0, $t0, $a0 + veor $t1, $t1, $a1 + vand $t0, $t0, $mask + vand $t1, $t1, $mask + veor $a0, $a0, $t0 + vshl.u64 $t0, $t0, #$n + veor $a1, $a1, $t1 + vshl.u64 $t1, $t1, #$n + veor $b0, $b0, $t0 + veor $b1, $b1, $t1 +___ +} + +sub bitslice { +my @x=reverse(@_[0..7]); +my ($t0,$t1,$t2,$t3)=@_[8..11]; +$code.=<<___; + vmov.i8 $t0,#0x55 @ compose .LBS0 + vmov.i8 $t1,#0x33 @ compose .LBS1 +___ + &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); + &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); +$code.=<<___; + vmov.i8 $t0,#0x0f @ compose .LBS2 +___ + &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); + &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + + &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); + &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); +} + +$code.=<<___; +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define __ARM_MAX_ARCH__ 7 +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#if defined(__thumb2__) && !defined(__APPLE__) +.thumb +#else +.code 32 +# undef __thumb2__ +#endif + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +.LM0ISR: @ InvShiftRows constants + .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: + .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: + .quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LM0SR: @ ShiftRows constants + .quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: + .quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: + .quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0: + .quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LREVM0SR: + .quad 0x090d01050c000408, 0x03070b0f060a0e02 +.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by " +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +_bsaes_encrypt8: + adr $const,. 
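+	@ $const now holds the address of _bsaes_encrypt8 itself; the
+	@ _bsaes_const table sits just above, so .LM0SR is reached either
+	@ directly with adr (Thumb-2/Apple) or by subtracting the fixed
+	@ offset #_bsaes_encrypt8-.LM0SR below.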
+ vldmia $key!, {@XMM[9]} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0SR +#else + sub $const,$const,#_bsaes_encrypt8-.LM0SR +#endif + + vldmia $const!, {@XMM[8]} @ .LM0SR +_bsaes_encrypt8_alt: + veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key + veor @XMM[11], @XMM[1], @XMM[9] + vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` + veor @XMM[12], @XMM[2], @XMM[9] + vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` + veor @XMM[13], @XMM[3], @XMM[9] + vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` + veor @XMM[14], @XMM[4], @XMM[9] + vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` + veor @XMM[15], @XMM[5], @XMM[9] + vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` + veor @XMM[10], @XMM[6], @XMM[9] + vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` + veor @XMM[11], @XMM[7], @XMM[9] + vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` + vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` +_bsaes_encrypt8_bitslice: +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + sub $rounds,$rounds,#1 + b .Lenc_sbox +.align 4 +.Lenc_loop: +___ + &ShiftRows (@XMM[0..7, 8..12]); +$code.=".Lenc_sbox:\n"; + &Sbox (@XMM[0..7, 8..15]); +$code.=<<___; + subs $rounds,$rounds,#1 + bcc .Lenc_done +___ + &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); +$code.=<<___; + vldmia $const, {@XMM[12]} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM + addeq $const,$const,#0x10 + bne .Lenc_loop + vldmia $const, {@XMM[12]} @ .LSRM0 + b .Lenc_loop +.align 4 +.Lenc_done: +___ + # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb + &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); +$code.=<<___; + vldmia $key, {@XMM[8]} @ last round key + veor @XMM[4], @XMM[4], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[8] + veor @XMM[3], @XMM[3], @XMM[8] + veor @XMM[7], @XMM[7], @XMM[8] + veor @XMM[2], @XMM[2], @XMM[8] + veor @XMM[5], @XMM[5], @XMM[8] + veor @XMM[0], @XMM[0], @XMM[8] + veor @XMM[1], @XMM[1], @XMM[8] + bx lr +.size _bsaes_encrypt8,.-_bsaes_encrypt8 +___ +} +{ +my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6"); + +sub bitslice_key { +my @x=reverse(@_[0..7]); +my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; + + &swapmove (@x[0,1],1,$bs0,$t2,$t3); +$code.=<<___; + @ &swapmove(@x[2,3],1,$t0,$t2,$t3); + vmov @x[2], @x[0] + vmov @x[3], @x[1] +___ + #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); + + &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); +$code.=<<___; + @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + vmov @x[4], @x[0] + vmov @x[6], @x[2] + vmov @x[5], @x[1] + vmov @x[7], @x[3] +___ + &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); + &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); +} + +$code.=<<___; +.type _bsaes_key_convert,%function +.align 4 +_bsaes_key_convert: + adr $const,. + vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr $const,.LM0 +#else + sub $const,$const,#_bsaes_key_convert-.LM0 +#endif + vld1.8 {@XMM[15]}, [$inp]! 
@ load round 1 key + + vmov.i8 @XMM[8], #0x01 @ bit masks + vmov.i8 @XMM[9], #0x02 + vmov.i8 @XMM[10], #0x04 + vmov.i8 @XMM[11], #0x08 + vmov.i8 @XMM[12], #0x10 + vmov.i8 @XMM[13], #0x20 + vldmia $const, {@XMM[14]} @ .LM0 + +#ifdef __ARMEL__ + vrev32.8 @XMM[7], @XMM[7] + vrev32.8 @XMM[15], @XMM[15] +#endif + sub $rounds,$rounds,#1 + vstmia $out!, {@XMM[7]} @ save round 0 key + b .Lkey_loop + +.align 4 +.Lkey_loop: + vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])` + vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])` + vmov.i8 @XMM[6], #0x40 + vmov.i8 @XMM[15], #0x80 + + vtst.8 @XMM[0], @XMM[7], @XMM[8] + vtst.8 @XMM[1], @XMM[7], @XMM[9] + vtst.8 @XMM[2], @XMM[7], @XMM[10] + vtst.8 @XMM[3], @XMM[7], @XMM[11] + vtst.8 @XMM[4], @XMM[7], @XMM[12] + vtst.8 @XMM[5], @XMM[7], @XMM[13] + vtst.8 @XMM[6], @XMM[7], @XMM[6] + vtst.8 @XMM[7], @XMM[7], @XMM[15] + vld1.8 {@XMM[15]}, [$inp]! @ load next round key + vmvn @XMM[0], @XMM[0] @ "pnot" + vmvn @XMM[1], @XMM[1] + vmvn @XMM[5], @XMM[5] + vmvn @XMM[6], @XMM[6] +#ifdef __ARMEL__ + vrev32.8 @XMM[15], @XMM[15] +#endif + subs $rounds,$rounds,#1 + vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key + bne .Lkey_loop + + vmov.i8 @XMM[7],#0x63 @ compose .L63 + @ don't save last round key + bx lr +.size _bsaes_key_convert,.-_bsaes_key_convert +___ +} + +{ +my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10))); +my $const = "r6"; # shared with _bsaes_encrypt8_alt +my $keysched = "sp"; + +$code.=<<___; +.global bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 5 +bsaes_ctr32_encrypt_blocks: + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. + mov ip, sp + stmdb sp!, {r4-r10, lr} + VFP_ABI_PUSH + ldr $ctr, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov $fp, sp @ save sp + + ldr $rounds, [$key, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + add r12, #`128-32` @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 @ sp is $keysched + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + + vld1.8 {@XMM[0]}, [$ctr] @ load counter +#ifdef __APPLE__ + mov $ctr, #:lower16:(.LREVM0SR-.LM0) + add $ctr, $const, $ctr +#else + add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr +#endif + vldmia $keysched, {@XMM[4]} @ load round0 key +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + +.align 2 +0: add r12, $key, #248 + vld1.8 {@XMM[0]}, [$ctr] @ load counter + adrl $ctr, .LREVM0SR @ borrow $ctr + vldmia r12, {@XMM[4]} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 @XMM[8],#1 @ compose 1<<96 + veor @XMM[9],@XMM[9],@XMM[9] + vrev32.8 @XMM[0],@XMM[0] + vext.8 @XMM[8],@XMM[9],@XMM[8],#4 + vrev32.8 @XMM[4],@XMM[4] + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 + vstmia $keysched, {@XMM[4]} @ save adjusted round0 key + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ 
compose 3<<96 + vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1 + vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2 + vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3 + vadd.u32 @XMM[4], @XMM[1], @XMM[10] + vadd.u32 @XMM[5], @XMM[2], @XMM[10] + vadd.u32 @XMM[6], @XMM[3], @XMM[10] + vadd.u32 @XMM[7], @XMM[4], @XMM[10] + vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia $keysched, {@XMM[9]} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, $keysched, #0x10 @ pass next round key +#else + add r4, $key, #`248+16` +#endif + vldmia $ctr, {@XMM[8]} @ .LREVM0SR + mov r5, $rounds @ pass rounds + vstmia $fp, {@XMM[10]} @ save next counter +#ifdef __APPLE__ + mov $const, #:lower16:(.LREVM0SR-.LSR) + sub $const, $ctr, $const +#else + sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants +#endif + + bl _bsaes_encrypt8_alt + + subs $len, $len, #8 + blo .Lctr_enc_loop_done + + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[0], @XMM[8] + veor @XMM[1], @XMM[9] + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! + veor @XMM[4], @XMM[10] + veor @XMM[6], @XMM[11] + vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! + veor @XMM[3], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[7], @XMM[13] + veor @XMM[2], @XMM[14] + vst1.8 {@XMM[4]}, [$out]! + veor @XMM[5], @XMM[15] + vst1.8 {@XMM[6]}, [$out]! + vmov.i32 @XMM[8], #1 @ compose 1<<96 + vst1.8 {@XMM[3]}, [$out]! + veor @XMM[9], @XMM[9], @XMM[9] + vst1.8 {@XMM[7]}, [$out]! + vext.8 @XMM[8], @XMM[9], @XMM[8], #4 + vst1.8 {@XMM[2]}, [$out]! + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 + vst1.8 {@XMM[5]}, [$out]! + vldmia $fp, {@XMM[0]} @ load counter + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add $len, $len, #8 + vld1.8 {@XMM[8]}, [$inp]! @ load input + veor @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [$out]! @ write output + cmp $len, #2 + blo .Lctr_enc_done + vld1.8 {@XMM[9]}, [$inp]! + veor @XMM[1], @XMM[9] + vst1.8 {@XMM[1]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[10]}, [$inp]! + veor @XMM[4], @XMM[10] + vst1.8 {@XMM[4]}, [$out]! + cmp $len, #4 + blo .Lctr_enc_done + vld1.8 {@XMM[11]}, [$inp]! + veor @XMM[6], @XMM[11] + vst1.8 {@XMM[6]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[12]}, [$inp]! + veor @XMM[3], @XMM[12] + vst1.8 {@XMM[3]}, [$out]! + cmp $len, #6 + blo .Lctr_enc_done + vld1.8 {@XMM[13]}, [$inp]! + veor @XMM[7], @XMM[13] + vst1.8 {@XMM[7]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[14]}, [$inp] + veor @XMM[2], @XMM[14] + vst1.8 {@XMM[2]}, [$out]! + +.Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +.Lctr_enc_bzero: @ wipe key schedule [if any] + vstmia $keysched!, {q0-q1} + cmp $keysched, $fp + bne .Lctr_enc_bzero +#else + vstmia $keysched, {q0-q1} +#endif + + mov sp, $fp + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. 
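+	@ Note on structure (a non-normative summary of the code above): the main
+	@ path encrypts eight counter blocks per iteration; .Lctr_enc_loop_done
+	@ consumes a remaining 1-7 block tail one block at a time, and
+	@ .Lctr_enc_bzero wipes the stack-allocated bit-sliced key schedule before
+	@ the epilogue restores the callee-saved registers.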
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +___ +} +$code.=<<___; +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-armv4.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-armv4.pl new file mode 100644 index 0000000000..c16ed377d3 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-armv4.pl @@ -0,0 +1,303 @@ +#! /usr/bin/env perl +# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+32 bytes shared table]. There is no +# experimental performance data available yet. The only approximation +# that can be made at this point is based on code size. Inner loop is +# 32 instructions long and on single-issue core should execute in <40 +# cycles. Having verified that gcc 3.4 didn't unroll corresponding +# loop, this assembler loop body was found to be ~3x smaller than +# compiler-generated one... +# +# July 2010 +# +# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on +# Cortex A8 core and ~25 cycles per processed byte (which was observed +# to be ~3 times faster than gcc-generated code:-) +# +# February 2011 +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Cortex A8 core and ~23.5 cycles per byte. +# +# March 2011 +# +# Add NEON implementation featuring polynomial multiplication, i.e. no +# lookup tables involved. On Cortex A8 it was measured to process one +# byte in 15 cycles or 55% faster than integer-only code. +# +# April 2014 +# +# Switch to multiplication algorithm suggested in paper referred +# below and combine it with reduction algorithm from x86 module. +# Performance improvement over previous version varies from 65% on +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 +# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63, +# Snapdragon S4 - in 9.33. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf + +# ==================================================================== +# Note about "528B" variant. 
In ARM case it makes lesser sense to +# implement it for following reasons: +# +# - performance improvement won't be anywhere near 50%, because 128- +# bit shift operation is neatly fused with 128-bit xor here, and +# "538B" variant would eliminate only 4-5 instructions out of 32 +# in the inner loop (meaning that estimated improvement is ~15%); +# - ARM-based systems are often embedded ones and extra memory +# consumption might be unappreciated (for so little improvement); +# +# Byte order [in]dependence. ========================================= +# +# Caller is expected to maintain specific *dword* order in Htable, +# namely with *least* significant dword of 128-bit value at *lower* +# address. This differs completely from C code and has everything to +# do with ldm instruction and order in which dwords are "consumed" by +# algorithm. *Byte* order within these dwords in turn is whatever +# *native* byte order on current platform. See gcm128.c for working +# example... + +# This file was patched in BoringSSL to remove the variable-time 4-bit +# implementation. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$Xi="r0"; # argument block +$Htbl="r1"; +$inp="r2"; +$len="r3"; + +$code=<<___; +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) 
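+@ Only the NEON gcm_init_neon, gcm_gmult_neon and gcm_ghash_neon routines are
+@ emitted below; the variable-time 4-bit table implementation was removed, as
+@ noted in the file header.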
+.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif +___ +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); + +sub clmul64x64 { +my ($r,$a,$b)=@_; +$code.=<<___; + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, $t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, $t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 +___ +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 $IN#hi,[r1]! @ load H + vmov.i8 $t0,#0xe1 + vld1.64 $IN#lo,[r1] + vshl.i64 $t0#hi,#57 + vshr.u64 $t0#lo,#63 @ t0=0xc2....01 + vdup.8 $t1,$IN#hi[7] + vshr.u64 $Hlo,$IN#lo,#63 + vshr.s8 $t1,#7 @ broadcast carry bit + vshl.i64 $IN,$IN,#1 + vand $t0,$t0,$t1 + vorr $IN#hi,$Hlo @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vstmia r0,{$IN} + + ret @ bx lr +.size gcm_init_neon,.-gcm_init_neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + vld1.64 $IN#hi,[$Xi]! @ load Xi + vld1.64 $IN#lo,[$Xi]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + mov $len,#16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 $Xl#hi,[$Xi]! @ load Xi + vld1.64 $Xl#lo,[$Xi]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 $Xl,$Xl +#endif + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 $IN#hi,[$inp]! @ load inp + vld1.64 $IN#lo,[$inp]! 
+#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $IN,$Xl @ inp^=Xi +.Lgmult_neon: +___ + &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo +$code.=<<___; + veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing +___ + &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) + &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi +$code.=<<___; + veor $Xm,$Xm,$Xl @ Karatsuba post-processing + veor $Xm,$Xm,$Xh + veor $Xl#hi,$Xl#hi,$Xm#lo + veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 $t1,$Xl,#57 @ 1st phase + vshl.i64 $t2,$Xl,#62 + veor $t2,$t2,$t1 @ + vshl.i64 $t1,$Xl,#63 + veor $t2, $t2, $t1 @ + veor $Xl#hi,$Xl#hi,$t2#lo @ + veor $Xh#lo,$Xh#lo,$t2#hi + + vshr.u64 $t2,$Xl,#1 @ 2nd phase + veor $Xh,$Xh,$Xl + veor $Xl,$Xl,$t2 @ + vshr.u64 $t2,$t2,#6 + vshr.u64 $Xl,$Xl,#1 @ + veor $Xl,$Xl,$Xh @ + veor $Xl,$Xl,$t2 @ + + subs $len,#16 + bne .Loop_neon + +#ifdef __ARMEL__ + vrev64.8 $Xl,$Xl +#endif + sub $Xi,#16 + vst1.64 $Xl#hi,[$Xi]! @ write out Xi + vst1.64 $Xl#lo,[$Xi] + + ret @ bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +___ +} +$code.=<<___; +.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl new file mode 100644 index 0000000000..b4ae1db8c9 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-neon-armv8.pl @@ -0,0 +1,297 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It +# implements the multiplication algorithm described in: +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf +# +# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is +# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit +# NEON, the low and high halves of the 128-bit register q0 are accessible as +# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of +# vN. Where the 32-bit version would use the upper half, this file must keep +# halves in separate registers. +# +# The other distinction is in syntax. 
32-bit NEON embeds lane information in the +# instruction name, while AArch64 uses suffixes on the registers. For instance, +# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written: +# +# vshl.i64 q0, q0, #1 +# +# in 64-bit, it would be written: +# +# shl v0.2d, v0.2d, #1 +# +# See Programmer's Guide for ARMv8-A, section 7 for details. +# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf +# +# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ +# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials +# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit +# polynomial and is conditioned on the PMULL extension. This file emulates the +# latter with the former. + +use strict; + +my $flavour = shift; +my $output; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; + my $dir = $1; + my $xlate; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block +my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4)); +my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7)); +# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers +# to spare. +my ($t0, $t1, $t2, $t3) = map("v$_", (16..19)); +my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23)); +my ($k48_k32, $k16_k0) = map("v$_", (24..25)); + +my $code = ""; + +# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b +# must be distinct from $t* and $k*. $t* are clobbered by the emitted code. +sub clmul64x64 { +my ($r, $a, $b) = @_; +$code .= <<___; + ext $t0.8b, $a.8b, $a.8b, #1 // A1 + pmull $t0.8h, $t0.8b, $b.8b // F = A1*B + ext $r.8b, $b.8b, $b.8b, #1 // B1 + pmull $r.8h, $a.8b, $r.8b // E = A*B1 + ext $t1.8b, $a.8b, $a.8b, #2 // A2 + pmull $t1.8h, $t1.8b, $b.8b // H = A2*B + ext $t3.8b, $b.8b, $b.8b, #2 // B2 + pmull $t3.8h, $a.8b, $t3.8b // G = A*B2 + ext $t2.8b, $a.8b, $a.8b, #3 // A3 + eor $t0.16b, $t0.16b, $r.16b // L = E + F + pmull $t2.8h, $t2.8b, $b.8b // J = A3*B + ext $r.8b, $b.8b, $b.8b, #3 // B3 + eor $t1.16b, $t1.16b, $t3.16b // M = G + H + pmull $r.8h, $a.8b, $r.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L) + // vand \$t0#hi, \$t0#hi, \$k48 + // veor \$t0#lo, \$t0#lo, \$t0#hi + // + // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M) + // vand \$t1#hi, \$t1#hi, \$k32 + // veor \$t1#lo, \$t1#lo, \$t1#hi + // + // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N) + // vand \$t2#hi, \$t2#hi, \$k16 + // veor \$t2#lo, \$t2#lo, \$t2#hi + // + // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 \$t3#hi, #0 + // + // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. 
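+	// For reference, once N = I + J and K = A*B4 have been formed below,
+	// the emulated 64x64 carry-less product is assembled (shift counts in
+	// bits, "+" denoting XOR) as
+	//
+	//   r = D + (L << 8) + (M << 16) + (N << 24) + (K << 32)
+	//
+	// with D = A*B, exactly as in the 32-bit original.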
+ + ext $t3.8b, $b.8b, $b.8b, #4 // B4 + eor $t2.16b, $t2.16b, $r.16b // N = I + J + pmull $t3.8h, $a.8b, $t3.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 $t0l_t1l.2d, $t0.2d, $t1.2d + zip1 $t2l_t3l.2d, $t2.2d, $t3.2d + zip2 $t0h_t1h.2d, $t0.2d, $t1.2d + zip2 $t2h_t3h.2d, $t2.2d, $t3.2d + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b + and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d + zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d + + ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8 + ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16 + pmull $r.8h, $a.8b, $b.8b // D = A*B + ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32 + ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24 + eor $t0.16b, $t0.16b, $t1.16b + eor $t2.16b, $t2.16b, $t3.16b + eor $r.16b, $r.16b, $t0.16b + eor $r.16b, $r.16b, $t2.16b +___ +} + +$code .= <<___; +.text + +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {$t1.2d}, [x1] // load H + movi $t3.16b, #0xe1 + shl $t3.2d, $t3.2d, #57 // 0xc2.0 + ext $INlo.16b, $t1.16b, $t1.16b, #8 + ushr $t2.2d, $t3.2d, #63 + dup $t1.4s, $t1.s[1] + ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01 + ushr $t2.2d, $INlo.2d, #63 + sshr $t1.4s, $t1.4s, #31 // broadcast carry bit + and $t2.16b, $t2.16b, $t0.16b + shl $INlo.2d, $INlo.2d, #1 + ext $t2.16b, $t2.16b, $t2.16b, #8 + and $t0.16b, $t0.16b, $t1.16b + orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1 + eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H + st1 {$Hlo.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {$INlo.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $INlo.16b, $INlo.16b // byteswap Xi + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + + mov $len, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {$Xl.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $Xl.16b, $Xl.16b // byteswap Xi + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {$INlo.16b}, [$inp], #16 // load inp + rev64 $INlo.16b, $INlo.16b // byteswap inp + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into $INlo and $INhi. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins $INhi.d[0], $INlo.d[1] +___ +&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo +$code .= <<___; + eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing +___ +&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi) +&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi +$code .= <<___; + ext $t0.16b, $Xl.16b, $Xh.16b, #8 + eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing + eor $Xm.16b, $Xm.16b, $Xh.16b + eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi + ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins $Xh.d[0], $Xm.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl $t1.2d, $Xl.2d, #57 // 1st phase + shl $t2.2d, $Xl.2d, #62 + eor $t2.16b, $t2.16b, $t1.16b // + shl $t1.2d, $Xl.2d, #63 + eor $t2.16b, $t2.16b, $t1.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor $t2.16b, $t2.16b, $Xm.16b + ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0] + ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1] + + ushr $t2.2d, $Xl.2d, #1 // 2nd phase + eor $Xh.16b, $Xh.16b,$Xl.16b + eor $Xl.16b, $Xl.16b,$t2.16b // + ushr $t2.2d, $t2.2d, #6 + ushr $Xl.2d, $Xl.2d, #1 // + eor $Xl.16b, $Xl.16b, $Xh.16b // + eor $Xl.16b, $Xl.16b, $t2.16b // + + subs $len, $len, #16 + bne .Loop_neon + + rev64 $Xl.16b, $Xl.16b // byteswap Xi and write + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + st1 {$Xl.16b}, [$Xi] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.asciz "GHASH for ARMv8, derived from ARMv4 version by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86.pl new file mode 100644 index 0000000000..1cebb4b20e --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86.pl @@ -0,0 +1,460 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# March, May, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two +# code paths: vanilla x86 and vanilla SSE. Former will be executed on +# 486 and Pentium, latter on all others. SSE GHASH features so called +# "528B" variant of "4-bit" method utilizing additional 256+16 bytes +# of per-key storage [+512 bytes shared table]. 
Performance results +# are for streamed GHASH subroutine and are expressed in cycles per +# processed byte, less is better: +# +# gcc 2.95.3(*) SSE assembler x86 assembler +# +# Pentium 105/111(**) - 50 +# PIII 68 /75 12.2 24 +# P4 125/125 17.8 84(***) +# Opteron 66 /70 10.1 30 +# Core2 54 /67 8.4 18 +# Atom 105/105 16.8 53 +# VIA Nano 69 /71 13.0 27 +# +# (*) gcc 3.4.x was observed to generate few percent slower code, +# which is one of reasons why 2.95.3 results were chosen, +# another reason is lack of 3.4.x results for older CPUs; +# comparison with SSE results is not completely fair, because C +# results are for vanilla "256B" implementation, while +# assembler results are for "528B";-) +# (**) second number is result for code compiled with -fPIC flag, +# which is actually more relevant, because assembler code is +# position-independent; +# (***) see comment in non-MMX routine for further details; +# +# To summarize, it's >2-5 times faster than gcc-generated code. To +# anchor it to something else SHA1 assembler processes one byte in +# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE +# in particular, see comment at the end of the file... + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.10 cycles per processed byte. +# The question is how close is it to theoretical limit? The pclmulqdq +# instruction latency appears to be 14 cycles and there can't be more +# than 2 of them executing at any given time. This means that single +# Karatsuba multiplication would take 28 cycles *plus* few cycles for +# pre- and post-processing. Then multiplication has to be followed by +# modulo-reduction. Given that aggregated reduction method [see +# "Carry-less Multiplication and Its Usage for Computing the GCM Mode" +# white paper by Intel] allows you to perform reduction only once in +# a while we can assume that asymptotic performance can be estimated +# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction +# and Naggr is the aggregation factor. +# +# Before we proceed to this implementation let's have closer look at +# the best-performing code suggested by Intel in their white paper. +# By tracing inter-register dependencies Tmod is estimated as ~19 +# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per +# processed byte. As implied, this is quite optimistic estimate, +# because it does not account for Karatsuba pre- and post-processing, +# which for a single multiplication is ~5 cycles. Unfortunately Intel +# does not provide performance data for GHASH alone. But benchmarking +# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt +# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that +# the result accounts even for pre-computing of degrees of the hash +# key H, but its portion is negligible at 16KB buffer size. +# +# Moving on to the implementation in question. Tmod is estimated as +# ~13 cycles and Naggr is 2, giving asymptotic performance of ... +# 2.16. How is it possible that measured performance is better than +# optimistic theoretical estimate? There is one thing Intel failed +# to recognize. By serializing GHASH with CTR in same subroutine +# former's performance is really limited to above (Tmul + Tmod/Naggr) +# equation. But if GHASH procedure is detached, the modulo-reduction +# can be interleaved with Naggr-1 multiplications at instruction level +# and under ideal conditions even disappear from the equation. So that +# optimistic theoretical estimate for this implementation is ... +# 28/16=1.75, and not 2.16. 
Well, it's probably way too optimistic, +# at least for such small Naggr. I'd argue that (28+Tproc/Naggr), +# where Tproc is time required for Karatsuba pre- and post-processing, +# is more realistic estimate. In this case it gives ... 1.91 cycles. +# Or in other words, depending on how well we can interleave reduction +# and one of the two multiplications the performance should be between +# 1.91 and 2.16. As already mentioned, this implementation processes +# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart +# - in 2.02. x86_64 performance is better, because larger register +# bank allows to interleave reduction and multiplication better. +# +# Does it make sense to increase Naggr? To start with it's virtually +# impossible in 32-bit mode, because of limited register bank +# capacity. Otherwise improvement has to be weighed against slower +# setup, as well as code size and complexity increase. As even +# optimistic estimate doesn't promise 30% performance improvement, +# there are currently no plans to increase Naggr. +# +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. + +# January 2010 +# +# Tweaked to optimize transitions between integer and FP operations +# on same XMM register, PCLMULQDQ subroutine was measured to process +# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. +# The minor regression on Westmere is outweighed by ~15% improvement +# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in +# similar manner resulted in almost 20% degradation on Sandy Bridge, +# where original 64-bit code processes one byte in 1.95 cycles. + +##################################################################### +# For reference, AMD Bulldozer processes one byte in 1.98 cycles in +# 32-bit mode and 1.89 in 64-bit. + +# February 2013 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9. Resulting performance is 1.96 cycles per byte on +# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. + +# This file was patched in BoringSSL to remove the variable-time 4-bit +# implementation. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); + +$x86only=0; +$sse2=1; + +if (!$x86only) {{{ +if ($sse2) {{ +###################################################################### +# PCLMULQDQ version. + +$Xip="eax"; +$Htbl="edx"; +$const="ecx"; +$inp="esi"; +$len="ebx"; + +($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; +($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); +($Xn,$Xhn)=("xmm6","xmm7"); + +&static_label("bswap"); + +sub clmul64x64_T2 { # minimal "register" pressure +my ($Xhi,$Xi,$Hkey,$HK)=@_; + + &movdqa ($Xhi,$Xi); # + &pshufd ($T1,$Xi,0b01001110); + &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK)); + &pxor ($T1,$Xi); # + &pxor ($T2,$Hkey) if (!defined($HK)); + $HK=$T2 if (!defined($HK)); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T1,$HK,0x00); ####### + &xorps ($T1,$Xi); # + &xorps ($T1,$Xhi); # + + &movdqa ($T2,$T1); # + &psrldq ($T1,8); + &pslldq ($T2,8); # + &pxor ($Xhi,$T1); + &pxor ($Xi,$T2); # +} + + +if (1) { # Algorithm 9 with <<1 twist. + # Reduction is shorter and uses only two + # temporary registers, which makes it better + # candidate for interleaving with 64x64 + # multiplication. 
Pre-modulo-scheduled loop + # was found to be ~20% faster than Algorithm 5 + # below. Algorithm 9 was therefore chosen for + # further optimization... + +sub reduction_alg9 { # 17/11 times faster than Intel version +my ($Xhi,$Xi) = @_; + + # 1st phase + &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,57); # + &movdqa ($T1,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + + # 2nd phase + &movdqa ($T2,$Xi); + &psrlq ($Xi,1); + &pxor ($Xhi,$T2); # + &pxor ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$Xhi) # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # <<1 twist + &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword + &movdqa ($T1,$Hkey); + &psllq ($Hkey,1); + &pxor ($T3,$T3); # + &psrlq ($T1,63); + &pcmpgtd ($T3,$T2); # broadcast carry bit + &pslldq ($T1,8); + &por ($Hkey,$T1); # H<<=1 + + # magic reduction + &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial + &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &pshufd ($T1,$Hkey,0b01001110); + &pshufd ($T2,$Xi,0b01001110); + &pxor ($T1,$Hkey); # Karatsuba pre-processing + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &pxor ($T2,$Xi); # Karatsuba pre-processing + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + &palignr ($T2,$T1,8); # low part is H.lo^H.hi + &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt" + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &movdqu ($T3,&QWP(32,$Htbl)); + &pxor ($Xi,$T1); # Ii+Xi + + &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($T1,$Xn); # + &lea ($inp,&DWP(32,$inp)); # i+=2 + + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &pclmulqdq ($T1,$T3,0x00); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &nop (); + + &sub ($len,0x20); + &jbe (&label("even_tail")); + &jmp (&label("mod_loop")); + +&set_label("mod_loop",32); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + &nop (); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movups ($Hkey,&QWP(0,$Htbl)); # load H + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &movdqa ($T3,&QWP(0,$const)); + &xorps ($Xhi,$Xhn); + &movdqu ($Xhn,&QWP(0,$inp)); # Ii + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pxor ($T1,$Xhi); # + + &pshufb 
($Xhn,$T3); + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + &pshufb ($Xn,$T3); + &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early + + &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &movups ($T3,&QWP(32,$Htbl)); + &psllq ($Xi,57); # + &movdqa ($T1,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + &pshufd ($T1,$Xhn,0b01001110); + &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,1); + &pxor ($T1,$Xhn); + &pxor ($Xhi,$T2); # + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &pxor ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$Xhi) # + &pclmulqdq ($T1,$T3,0x00); ####### + + &lea ($inp,&DWP(32,$inp)); + &sub ($len,0x20); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movdqa ($T3,&QWP(0,$const)); + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &xorps ($Xhi,$Xhn); + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &pxor ($T1,$Xhi); # + + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + + &reduction_alg9 ($Xhi,$Xi); + + &test ($len,$len); + &jnz (&label("done")); + + &movups ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); + +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} + +&set_label("bswap",64); + &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial +}} # $sse2 +}}} # !$x86only + +&asciz("GHASH for x86, CRYPTOGAMS by "); +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; + +# A question was risen about choice of vanilla MMX. Or rather why wasn't +# SSE2 chosen instead? In addition to the fact that MMX runs on legacy +# CPUs such as PIII, "4-bit" MMX version was observed to provide better +# performance than *corresponding* SSE2 one even on contemporary CPUs. +# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 +# implementation featuring full range of lookup-table sizes, but with +# per-invocation lookup table setup. Latter means that table size is +# chosen depending on how much data is to be hashed in every given call, +# more data - larger table. Best reported result for Core2 is ~4 cycles +# per processed byte out of 64KB block. This number accounts even for +# 64KB table setup overhead. As discussed in gcm128.c we choose to be +# more conservative in respect to lookup table sizes, but how do the +# results compare? Minimalistic "256B" MMX version delivers ~11 cycles +# on same platform. As also discussed in gcm128.c, next in line "8-bit +# Shoup's" or "4KB" method should deliver twice the performance of +# "256B" one, in other words not worse than ~6 cycles per byte. It +# should be also be noted that in SSE2 case improvement can be "super- +# linear," i.e. 
more than twice, mostly because >>8 maps to single +# instruction on SSE2 register. This is unlike "4-bit" case when >>4 +# maps to same amount of instructions in both MMX and SSE2 cases. +# Bottom line is that switch to SSE2 is considered to be justifiable +# only in case we choose to implement "8-bit" method... diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86_64.pl new file mode 100644 index 0000000000..1a228d75ab --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghash-x86_64.pl @@ -0,0 +1,1266 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# March, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH +# function features so called "528B" variant utilizing additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# Atom 31.6 16.8 +88% +# VIA Nano 21.8 10.1 +115% +# +# (*) comparison is not completely fair, because C results are +# for vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's mystery [to me] why Core2 result is not same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. +# +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. + +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. 
A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) +# Haswell 0.55(+93%) (if system doesn't support AVX) +# Broadwell 0.45(+110%)(if system doesn't support AVX) +# Skylake 0.44(+110%)(if system doesn't support AVX) +# Bulldozer 1.49(+27%) +# Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) +# Goldmont 1.08(+24%) + +# March 2013 +# +# ... 8x aggregate factor AVX code path is using reduction algorithm +# suggested by Shay Gueron[1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to above mentioned version. But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs in 0.41 cycles per byte on Haswell processor, in +# 0.29 on Broadwell, and in 0.36 on Skylake. +# +# Knights Landing achieves 1.09 cpb. +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest + +# This file was patched in BoringSSL to remove the variable-time 4-bit +# implementation. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be +# computed incorrectly. +# +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +$avx = 1; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$do4xaggr=1; + + +$code=<<___; +.text +___ + + +###################################################################### +# PCLMULQDQ version. + +@_4args=$win64? 
("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; +($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); + +sub clmul64x64_T2 { # minimal register pressure +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey,$T2 + pxor $Xi,$T1 # + pxor $Hkey,$T2 +___ +} else { +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 # +___ +} +$code.=<<___; + pclmulqdq \$0x00,$Hkey,$Xi ####### + pclmulqdq \$0x11,$Hkey,$Xhi ####### + pclmulqdq \$0x00,$HK,$T1 ####### + pxor $Xi,$T1 # + pxor $Xhi,$T1 # + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ +} + +sub reduction_alg9 { # 17/11 times faster than Intel version +my ($Xhi,$Xi) = @_; + +$code.=<<___; + # 1st phase + movdqa $Xi,$T2 # + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T1 # + pslldq \$8,$Xi + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # + + # 2nd phase + movdqa $Xi,$T2 + psrlq \$1,$Xi + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $Xhi,$Xi # +___ +} + +{ my ($Htbl,$Xip)=@_4args; + my $HK="%xmm6"; + +$code.=<<___; +.globl gcm_init_clmul +.type gcm_init_clmul,\@abi-omnipotent +.align 16 +gcm_init_clmul: +.cfi_startproc +.seh_startproc + _CET_ENDBR +.L_init_clmul: +___ +$code.=<<___ if ($win64); + sub \$0x18,%rsp +.seh_stackalloc 0x18 + movaps %xmm6,(%rsp) +.seh_savexmm %xmm6, 0 +.seh_endprologue +___ +$code.=<<___; + movdqu ($Xip),$Hkey + pshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + movdqa $Hkey,$T1 + psllq \$1,$Hkey + pxor $T3,$T3 # + psrlq \$63,$T1 + pcmpgtd $T2,$T3 # broadcast carry bit + pslldq \$8,$T1 + por $T1,$Hkey # H<<=1 + + # magic reduction + pand .L0x1c2_polynomial(%rip),$T3 + pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + pshufd \$0b01001110,$Hkey,$HK + movdqa $Hkey,$Xi + pxor $Hkey,$HK +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$Hkey,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $Hkey,$T1 # Karatsuba pre-processing + movdqu $Hkey,0x00($Htbl) # save H + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x10($Htbl) # save H^2 + palignr \$8,$T1,$T2 # low part is H.lo^H.hi... + movdqu $T2,0x20($Htbl) # save Karatsuba "salt" +___ +if ($do4xaggr) { + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqa $Xi,$T3 +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$T3,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $T3,$T1 # Karatsuba pre-processing + movdqu $T3,0x30($Htbl) # save H^3 + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x40($Htbl) # save H^4 + palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... 
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt" +___ +} +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +___ +$code.=<<___; + ret +.cfi_endproc +.seh_endproc +.size gcm_init_clmul,.-gcm_init_clmul +___ +} + + +{ my ($Xip,$Htbl,$inp,$len)=@_4args; + my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); + my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); + +$code.=<<___; +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,\@abi-omnipotent +.align 32 +gcm_ghash_clmul: +.cfi_startproc +.seh_startproc + _CET_ENDBR +.L_ghash_clmul: +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax + lea -0x20(%rax),%rsp +.seh_stackalloc 0x20+0x88 + movaps %xmm6,-0x20(%rax) +.seh_savexmm %xmm6, 0x20-0x20 + movaps %xmm7,-0x10(%rax) +.seh_savexmm %xmm7, 0x20-0x10 + movaps %xmm8,0(%rax) +.seh_savexmm %xmm8, 0x20+0 + movaps %xmm9,0x10(%rax) +.seh_savexmm %xmm9, 0x20+0x10 + movaps %xmm10,0x20(%rax) +.seh_savexmm %xmm10, 0x20+0x20 + movaps %xmm11,0x30(%rax) +.seh_savexmm %xmm11, 0x20+0x30 + movaps %xmm12,0x40(%rax) +.seh_savexmm %xmm12, 0x20+0x40 + movaps %xmm13,0x50(%rax) +.seh_savexmm %xmm13, 0x20+0x50 + movaps %xmm14,0x60(%rax) +.seh_savexmm %xmm14, 0x20+0x60 + movaps %xmm15,0x70(%rax) +.seh_savexmm %xmm15, 0x20+0x70 +.seh_endprologue +___ +$code.=<<___; + movdqa .Lbswap_mask(%rip),$T3 + + movdqu ($Xip),$Xi + movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$HK + pshufb $T3,$Xi + + sub \$0x10,$len + jz .Lodd_tail + + movdqu 0x10($Htbl),$Hkey2 +___ +if ($do4xaggr) { +my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); + +$code.=<<___; + cmp \$0x30,$len + jb .Lskip4x + + sub \$0x30,$len + mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff + movdqu 0x30($Htbl),$Hkey3 + movdqu 0x40($Htbl),$Hkey4 + + ####### + # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P + # + movdqu 0x30($inp),$Xln + movdqu 0x20($inp),$Xl + pshufb $T3,$Xln + pshufb $T3,$Xl + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey2,$Xl + pclmulqdq \$0x11,$Hkey2,$Xh + pclmulqdq \$0x10,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + xorps $Xm,$Xmn + + movdqu 0x10($inp),$Xl + movdqu 0($inp),$T1 + pshufb $T3,$Xl + pshufb $T3,$T1 + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $T1,$Xi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + pclmulqdq \$0x11,$Hkey3,$Xh + pclmulqdq \$0x00,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: + pclmulqdq \$0x00,$Hkey4,$Xi + xorps $Xm,$Xmn + movdqu 0x30($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x11,$Hkey4,$Xhi + xorps $Xln,$Xi + movdqu 0x20($inp),$Xln + movdqa $Xl,$Xh + pclmulqdq \$0x10,$HK,$T1 + pshufd \$0b01001110,$Xl,$Xm + xorps $Xhn,$Xhi + pxor $Xl,$Xm + pshufb $T3,$Xln + movups 0x20($Htbl),$HK + xorps $Xmn,$T1 + pclmulqdq \$0x00,$Hkey,$Xl + pshufd \$0b01001110,$Xln,$Xmn + + pxor $Xi,$T1 # aggregated Karatsuba post-processing + movdqa $Xln,$Xhn + pxor $Xhi,$T1 # + pxor $Xln,$Xmn + movdqa $T1,$T2 # + pclmulqdq \$0x11,$Hkey,$Xh + pslldq \$8,$T1 + psrldq \$8,$T2 # + pxor $T1,$Xi + movdqa .L7_mask(%rip),$T1 + pxor $T2,$Xhi # + movq %rax,$T2 + + pand $Xi,$T1 # 1st phase + pshufb $T1,$T2 # + pxor $Xi,$T2 # + pclmulqdq \$0x00,$HK,$Xm + psllq \$57,$T2 # + movdqa $T2,$T1 # + pslldq \$8,$T2 + pclmulqdq \$0x00,$Hkey2,$Xln + psrldq \$8,$T1 # 
+ pxor $T2,$Xi + pxor $T1,$Xhi # + movdqu 0($inp),$T1 + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhn + xorps $Xl,$Xln + movdqu 0x10($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x10,$HK,$Xmn + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + pshufb $T3,$T1 + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + + movdqa $Xl,$Xh + pxor $Xm,$Xmn + pshufd \$0b01001110,$Xl,$Xm + pxor $T2,$Xi # + pxor $T1,$Xhi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + psrlq \$1,$Xi # + pxor $Xhi,$Xi # + movdqa $Xi,$Xhi + pclmulqdq \$0x11,$Hkey3,$Xh + xorps $Xl,$Xln + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + + pclmulqdq \$0x00,$HK,$Xm + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jnc .Lmod4_loop + +.Ltail4x: + pclmulqdq \$0x00,$Hkey4,$Xi + pclmulqdq \$0x11,$Hkey4,$Xhi + pclmulqdq \$0x10,$HK,$T1 + xorps $Xm,$Xmn + xorps $Xln,$Xi + xorps $Xhn,$Xhi + pxor $Xi,$Xhi # aggregated Karatsuba post-processing + pxor $Xmn,$T1 + + pxor $Xhi,$T1 # + pxor $Xi,$Xhi + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ + &reduction_alg9($Xhi,$Xi); +$code.=<<___; + add \$0x40,$len + jz .Ldone + movdqu 0x20($Htbl),$HK + sub \$0x10,$len + jz .Lodd_tail +.Lskip4x: +___ +} +$code.=<<___; + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + movdqu ($inp),$T1 # Ii + movdqu 16($inp),$Xln # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xln + pxor $T1,$Xi # Ii+Xi + + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + lea 32($inp),$inp # i+=2 + nop + sub \$0x20,$len + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + movdqu ($inp),$T2 # Ii + pxor $Xi,$T1 # aggregated Karatsuba post-processing + pshufb $T3,$T2 + movdqu 16($inp),$Xln # Ii+1 + + pxor $Xhi,$T1 + pxor $T2,$Xhi # "Ii+Xi", consume early + pxor $T1,$Xmn + pshufb $T3,$Xln + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # + + movdqa $Xln,$Xhn # + + movdqa $Xi,$T2 # 1st phase + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + pclmulqdq \$0x00,$Hkey,$Xln ####### + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T1 # + pslldq \$8,$Xi + psrldq \$8,$T1 # + pxor $T2,$Xi + pshufd \$0b01001110,$Xhn,$Xmn + pxor $T1,$Xhi # + pxor $Xhn,$Xmn # + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey,$Xhn ####### + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + lea 32($inp),$inp + psrlq \$1,$Xi # + pclmulqdq \$0x00,$HK,$Xmn ####### + pxor $Xhi,$Xi # + + sub \$0x20,$len + ja .Lmod_loop + +.Leven_tail: + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + pxor $Xi,$T1 + pxor $Xhi,$T1 + pxor $T1,$Xmn + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # +___ + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + test $len,$len + jnz .Ldone + +.Lodd_tail: + movdqu ($inp),$T1 # Ii + pshufb $T3,$T1 + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); 
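+	# For reference, clmul64x64_T2 above is plain Karatsuba over GF(2)[x]:
+	# writing X = Xh*x^64 + Xl and H = Hh*x^64 + Hl,
+	#
+	#   X*H = Xh*Hh*x^128 + ((Xh+Xl)*(Hh+Hl) + Xh*Hh + Xl*Hl)*x^64 + Xl*Hl
+	#
+	# so three pclmulqdq suffice per multiplication, and reduction_alg9 folds
+	# the 256-bit product back modulo the GHASH polynomial
+	# x^128 + x^7 + x^2 + x + 1 (in the bit-reflected convention used here).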
+$code.=<<___; +.Ldone: + pshufb $T3,$Xi + movdqu $Xi,($Xip) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +___ +$code.=<<___; + ret +.cfi_endproc +.seh_endproc +.size gcm_ghash_clmul,.-gcm_ghash_clmul +___ +} + +$code.=<<___; +.globl gcm_init_avx +.type gcm_init_avx,\@abi-omnipotent +.align 32 +gcm_init_avx: +.cfi_startproc +.seh_startproc + _CET_ENDBR +___ +if ($avx) { +my ($Htbl,$Xip)=@_4args; +my $HK="%xmm6"; + +$code.=<<___ if ($win64); + sub \$0x18,%rsp +.seh_stackalloc 0x18 + movaps %xmm6,(%rsp) +.seh_savexmm %xmm6, 0 +.seh_endprologue +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Hkey + vpshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + vpsrlq \$63,$Hkey,$T1 + vpsllq \$1,$Hkey,$Hkey + vpxor $T3,$T3,$T3 # + vpcmpgtd $T2,$T3,$T3 # broadcast carry bit + vpslldq \$8,$T1,$T1 + vpor $T1,$Hkey,$Hkey # H<<=1 + + # magic reduction + vpand .L0x1c2_polynomial(%rip),$T3,$T3 + vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial + + vpunpckhqdq $Hkey,$Hkey,$HK + vmovdqa $Hkey,$Xi + vpxor $Hkey,$HK,$HK + mov \$4,%r10 # up to H^8 + jmp .Linit_start_avx +___ + +sub clmul64x64_avx { +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpunpckhqdq $Hkey,$Hkey,$T2 + vpxor $Xi,$T1,$T1 # + vpxor $Hkey,$T2,$T2 +___ +} else { +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpxor $Xi,$T1,$T1 # +___ +} +$code.=<<___; + vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### + vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### + vpclmulqdq \$0x00,$HK,$T1,$T1 ####### + vpxor $Xi,$Xhi,$T2 # + vpxor $T2,$T1,$T1 # + + vpslldq \$8,$T1,$T2 # + vpsrldq \$8,$T1,$T1 + vpxor $T2,$Xi,$Xi # + vpxor $T1,$Xhi,$Xhi +___ +} + +sub reduction_avx { +my ($Xhi,$Xi) = @_; + +$code.=<<___; + vpsllq \$57,$Xi,$T1 # 1st phase + vpsllq \$62,$Xi,$T2 + vpxor $T1,$T2,$T2 # + vpsllq \$63,$Xi,$T1 + vpxor $T1,$T2,$T2 # + vpslldq \$8,$T2,$T1 # + vpsrldq \$8,$T2,$T2 + vpxor $T1,$Xi,$Xi # + vpxor $T2,$Xhi,$Xhi + + vpsrlq \$1,$Xi,$T2 # 2nd phase + vpxor $Xi,$Xhi,$Xhi + vpxor $T2,$Xi,$Xi # + vpsrlq \$5,$T2,$T2 + vpxor $T2,$Xi,$Xi # + vpsrlq \$1,$Xi,$Xi # + vpxor $Xhi,$Xi,$Xi # +___ +} + +$code.=<<___; +.align 32 +.Linit_loop_avx: + vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; +.Linit_start_avx: + vmovdqa $Xi,$T3 +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; + vpshufd \$0b01001110,$T3,$T1 + vpshufd \$0b01001110,$Xi,$T2 + vpxor $T3,$T1,$T1 # Karatsuba pre-processing + vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 + vpxor $Xi,$T2,$T2 # Karatsuba pre-processing + vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 + lea 0x30($Htbl),$Htbl + sub \$1,%r10 + jnz .Linit_loop_avx + + vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped + vmovdqu $T3,-0x10($Htbl) + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +___ +$code.=<<___; + ret +.seh_endproc +.cfi_endproc +.size gcm_init_avx,.-gcm_init_avx +___ +} else { +$code.=<<___; + jmp .L_init_clmul +.size gcm_init_avx,.-gcm_init_avx +___ +} + +$code.=<<___; +.globl gcm_ghash_avx +.type gcm_ghash_avx,\@abi-omnipotent +.align 32 +gcm_ghash_avx: +.cfi_startproc +.seh_startproc + _CET_ENDBR +___ +if ($avx) { +my ($Xip,$Htbl,$inp,$len)=@_4args; +my ($Xlo,$Xhi,$Xmi, + $Zlo,$Zhi,$Zmi, + $Hkey,$HK,$T1,$T2, + $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); + +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax + lea -0x20(%rax),%rsp +.seh_stackalloc 0x20+0x88 + movaps %xmm6,-0x20(%rax) +.seh_savexmm %xmm6, 0x20-0x20 + movaps %xmm7,-0x10(%rax) +.seh_savexmm %xmm7, 0x20-0x10 + movaps %xmm8,0(%rax) +.seh_savexmm %xmm8, 0x20+0 + movaps %xmm9,0x10(%rax) +.seh_savexmm %xmm9, 0x20+0x10 + movaps %xmm10,0x20(%rax) +.seh_savexmm %xmm10, 0x20+0x20 + movaps %xmm11,0x30(%rax) +.seh_savexmm %xmm11, 0x20+0x30 + movaps %xmm12,0x40(%rax) +.seh_savexmm %xmm12, 0x20+0x40 + movaps %xmm13,0x50(%rax) +.seh_savexmm %xmm13, 0x20+0x50 + movaps %xmm14,0x60(%rax) +.seh_savexmm %xmm14, 0x20+0x60 + movaps %xmm15,0x70(%rax) +.seh_savexmm %xmm15, 0x20+0x70 +.seh_endprologue +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Xi # load $Xi + lea .L0x1c2_polynomial(%rip),%r10 + lea 0x40($Htbl),$Htbl # size optimization + vmovdqu .Lbswap_mask(%rip),$bswap + vpshufb $bswap,$Xi,$Xi + cmp \$0x80,$len + jb .Lshort_avx + sub \$0x80,$len + + vmovdqu 0x70($inp),$Ii # I[7] + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpshufb $bswap,$Ii,$Ii + vmovdqu 0x20-0x40($Htbl),$HK + + vpunpckhqdq $Ii,$Ii,$T2 + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Ii,$T2,$T2 + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x50($inp),$Ii # I[5] + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpxor $Ii,$T2,$T2 + vmovdqu 0x40($inp),$Ij # I[4] + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + + vpshufb $bswap,$Ij,$Ij + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x30($inp),$Ii # I[3] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + 
vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x20($inp),$Ij # I[2] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpxor $Xmi,$Zmi,$Zmi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x10($inp),$Ii # I[1] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu ($inp),$Ij # I[0] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x10,$HK,$T2,$Xmi + + lea 0x80($inp),$inp + cmp \$0x80,$len + jb .Ltail_avx + + vpxor $Xi,$Ij,$Ij # accumulate $Xi + sub \$0x80,$len + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x70($inp),$Ii # I[7] + vpxor $Xlo,$Zlo,$Zlo + vpxor $Ij,$T1,$T1 + vpclmulqdq \$0x00,$Hkey,$Ij,$Xi + vpshufb $bswap,$Ii,$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xo + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Tred + vmovdqu 0x20-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Zlo,$Xi,$Xi # collect result + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vxorps $Zhi,$Xo,$Xo + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Zmi,$Tred,$Tred + vxorps $Ij,$T1,$T1 + + vmovdqu 0x50($inp),$Ii # I[5] + vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Xo,$Tred,$Tred + vpslldq \$8,$Tred,$T2 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vpsrldq \$8,$Tred,$Tred + vpxor $T2, $Xi, $Xi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ii + vxorps $Tred,$Xo, $Xo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x40($inp),$Ij # I[4] + vpalignr \$8,$Xi,$Xi,$Tred # 1st phase + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vxorps $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + + vmovdqu 0x30($inp),$Ii # I[3] + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x20($inp),$Ij # I[2] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + vxorps $Tred,$Xi,$Xi + + 
vmovdqu 0x10($inp),$Ii # I[1] + vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vxorps $Xo,$Tred,$Tred + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu ($inp),$Ij # I[0] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Tred,$Ij,$Ij + vpclmulqdq \$0x10,$HK, $T2,$Xmi + vpxor $Xi,$Ij,$Ij # accumulate $Xi + + lea 0x80($inp),$inp + sub \$0x80,$len + jnc .Loop8x_avx + + add \$0x80,$len + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -0x10($inp,$len),$Ii # very last word + lea ($inp,$len),$inp + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vmovdqu 0x20-0x40($Htbl),$HK + vpshufb $bswap,$Ii,$Ij + + vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, + vmovdqa $Xhi,$Zhi # $Zhi and + vmovdqa $Xmi,$Zmi # $Zmi + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x20($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x30($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x50-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x40($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x50($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x80-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x60($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x70($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovq 0xb8-0x40($Htbl),$HK + sub \$0x10,$len + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor $Xi,$Ij,$Ij # accumulate $Xi +.Ltail_no_xor_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq 
\$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + + vmovdqu (%r10),$Tred + + vpxor $Xlo,$Zlo,$Xi + vpxor $Xhi,$Zhi,$Xo + vpxor $Xmi,$Zmi,$Zmi + + vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing + vpxor $Xo, $Zmi,$Zmi + vpslldq \$8, $Zmi,$T2 + vpsrldq \$8, $Zmi,$Zmi + vpxor $T2, $Xi, $Xi + vpxor $Zmi,$Xo, $Xo + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $Xo,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + cmp \$0,$len + jne .Lshort_avx + + vpshufb $bswap,$Xi,$Xi + vmovdqu $Xi,($Xip) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +___ +$code.=<<___; + ret +.cfi_endproc +.seh_endproc +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} else { +$code.=<<___; + jmp .L_ghash_clmul +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} + +$code.=<<___; +.section .rodata +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: + .long 7,0,7,0 +.align 64 + +.asciz "GHASH for x86_64, CRYPTOGAMS by " +.align 64 +.text +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/ghashv8-armx.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghashv8-armx.pl new file mode 100644 index 0000000000..1f1df1cf84 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/ghashv8-armx.pl @@ -0,0 +1,334 @@ +#! /usr/bin/env perl +# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. +# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel of Linaro from bits-n-pieces from other assembly modules. +# Just like aesv8-armx.pl this module supports both AArch32 and +# AArch64 execution modes. +# +# July 2014 +# +# Implement 2x aggregated reduction [see ghash-x86.pl for background +# information]. +# +# November 2017 +# +# AArch64 register bank to "accommodate" 4x aggregated reduction and +# improve performance by 20-70% depending on processor. 
+# +# Current performance in cycles per processed byte: +# +# 64-bit PMULL 32-bit PMULL 32-bit NEON(*) +# Apple A7 0.58 0.92 5.62 +# Cortex-A53 0.85 1.01 8.39 +# Cortex-A57 0.73 1.17 7.61 +# Denver 0.51 0.65 6.02 +# Mongoose 0.65 1.10 8.06 +# Kryo 0.76 1.16 8.00 +# +# (*) presented for reference/comparison purposes; + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$Xi="x0"; # argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); + +$code=<<___; +#if __ARM_MAX_ARCH__>=7 +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour !~ /64/); +.fpu neon +.code 32 +#undef __thumb2__ +___ + +################################################################################ +# void gcm_init_clmul(u128 Htable[16],const u64 H[2]); +# +# input: 128-bit H - secret parameter E(K,0^128) +# output: precomputed table filled with degrees of twisted H; +# H is twisted to handle reverse bitness of GHASH; +# only few of 16 slots of Htable[16] are used; +# data is opaque to outside world (which allows to +# optimize the code independently); +# +$code.=<<___; +.global gcm_init_clmul +.type gcm_init_clmul,%function +.align 4 +gcm_init_clmul: + AARCH64_VALID_CALL_TARGET + vld1.64 {$t1},[x1] @ load input H + vmov.i8 $xC2,#0xe1 + vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 + vext.8 $IN,$t1,$t1,#8 + vshr.u64 $t2,$xC2,#63 + vdup.32 $t1,${t1}[1] + vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 + vshr.u64 $t2,$IN,#63 + vshr.s32 $t1,$t1,#31 @ broadcast carry bit + vand $t2,$t2,$t0 + vshl.i64 $IN,$IN,#1 + vext.8 $t2,$t2,$t2,#8 + vand $t0,$t0,$t1 + vorr $IN,$IN,$t2 @ H<<<=1 + veor $H,$IN,$t0 @ twisted H + vst1.64 {$H},[x0],#16 @ store Htable[0] + + @ calculate H^2 + vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing + vpmull.p64 $Xl,$H,$H + veor $t0,$t0,$H + vpmull2.p64 $Xh,$H,$H + vpmull.p64 $Xm,$t0,$t0 + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $H2,$Xl,$t2 + + vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing + veor $t1,$t1,$H2 + vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed + vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2] +___ +if ($flavour =~ /64/) { +my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); + +$code.=<<___; + @ calculate H^3 and H^4 + vpmull.p64 $Xl,$H, $H2 + vpmull.p64 $Yl,$H2,$H2 + vpmull2.p64 $Xh,$H, $H2 + vpmull2.p64 $Yh,$H2,$H2 + vpmull.p64 $Xm,$t0,$t1 + vpmull.p64 $Ym,$t1,$t1 + + vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing + vext.8 $t1,$Yl,$Yh,#8 + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t0 + veor $t3,$Yl,$Yh + veor $Ym,$Ym,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + veor $Ym,$Ym,$t3 + vpmull.p64 $t3,$Yl,$xC2 + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Yh#lo,$Ym#hi + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vmov $Ym#hi,$Yl#lo + veor $Xl,$Xm,$t2 + veor $Yl,$Ym,$t3 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vext.8 $t3,$Yl,$Yl,#8 + vpmull.p64 $Xl,$Xl,$xC2 + vpmull.p64 $Yl,$Yl,$xC2 + veor $t2,$t2,$Xh + veor 
$t3,$t3,$Yh + veor $H, $Xl,$t2 @ H^3 + veor $H2,$Yl,$t3 @ H^4 + + vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing + vext.8 $t1,$H2,$H2,#8 + veor $t0,$t0,$H + veor $t1,$t1,$H2 + vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed + vst1.64 {$H-$H2},[x0] @ store Htable[3..5] +___ +} +$code.=<<___; + ret +.size gcm_init_clmul,.-gcm_init_clmul +___ +################################################################################ +# void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); +# +# input: Xi - current hash value; +# Htable - table precomputed in gcm_init_clmul; +# output: Xi - next hash value Xi; +# +$code.=<<___; +.global gcm_gmult_clmul +.type gcm_gmult_clmul,%function +.align 4 +gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET + vld1.64 {$t1},[$Xi] @ load Xi + vmov.i8 $xC2,#0xe1 + vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... + vshl.u64 $xC2,$xC2,#57 +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $IN,$t1,$t1,#8 + + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +___ +} + +$code.=<<___; +.asciz "GHASH for ARMv8, CRYPTOGAMS by " +.align 2 +#endif +___ + +if ($flavour =~ /64/) { ######## 64-bit code + sub unvmov { + my $arg=shift; + + $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && + sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, + $3<8?$3:$3+8,($4 eq "lo")?0:1; + } + foreach(split("\n",$code)) { + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vmov\s+(.*)/unvmov($1)/geo or + s/vext\.8/ext/o or + s/vshr\.s/sshr\.s/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + # fix up remaining legacy suffixes + s/\.[ui]?8(\s)/$1/o; + s/\.[uis]?32//o and s/\.16b/\.4s/go; + m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument + m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments + s/\.[uisp]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + # Switch preprocessor checks to aarch64 versions. + s/__ARME([BL])__/__AARCH64E$1__/go; + + print $_,"\n"; + } +} else { ######## 32-bit code + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + sub unvpmullp64 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { + my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + $word |= 0x00010001 if ($mnemonic =~ "2"); + # since ARMv7 instructions are always encoded little-endian. 
+ # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } + + foreach(split("\n",$code)) { + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remaining new-style suffixes + s/\],#[0-9]+/]!/o; + + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv7.pl new file mode 100644 index 0000000000..a46ad6c785 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv7.pl @@ -0,0 +1,884 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# Adapted from the original x86_64 version and 's ARMv8 +# version. +# +# armv7, aarch64, and x86_64 differ in several ways: +# +# * x86_64 SSSE3 instructions are two-address (destination operand is also a +# source), while NEON is three-address (destination operand is separate from +# two sources). +# +# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16. +# +# * x86_64 instructions can take memory references, while ARM is a load/store +# architecture. This means we sometimes need a spare register. +# +# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb), +# while armv7 only has a 64-bit byte shuffle (vtbl). +# +# This means this armv7 version must be a mix of both aarch64 and x86_64 +# implementations. armv7 and aarch64 have analogous SIMD instructions, so we +# base the instructions on aarch64. However, we cannot use aarch64's register +# allocation. x86_64's register count matches, but x86_64 is two-address. +# vpaes-armv8.pl already accounts for this in the comments, which use +# three-address AVX instructions instead of the original SSSE3 ones. We base +# register usage on these comments, which are preserved in this file. +# +# This means we do not use separate input and output registers as in aarch64 and +# cannot pin as many constants in the preheat functions. 
However, the load/store +# architecture means we must still deviate from x86_64 in places. +# +# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source +# and destination and 128-bit table. Fortunately, armv7 also allows addressing +# upper and lower halves of each 128-bit register. The lower half of q{N} is +# d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent +# instruction, +# +# vtbl.8 q0, q1, q2 @ Index each of q2's 16 bytes into q1. Store in q0. +# +# we write: +# +# vtbl.8 d0, q1, d4 @ Index each of d4's 8 bytes into q1. Store in d0. +# vtbl.8 d1, q1, d5 @ Index each of d5's 8 bytes into q1. Store in d1. +# +# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively and +# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note, +# however, that destination (q0) and table (q1) registers may no longer match. +# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the +# two-address pshufb always matched these operands, so this is common.) +# +# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR +# expands to an ADD or SUB of the pc register to find an address. That immediate +# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation. +# This means larger values must be more aligned. +# +# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may +# use either encoding (do we actually need to support this?). In ARM mode, the +# distances get large enough to require 16-byte alignment. Moving constants +# closer to their use resolves most of this, but common constants in +# _vpaes_consts are used by the whole file. Affected ADR instructions must be +# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this +# constraint have been commented. +# +# For details on ARM's immediate value encoding scheme, see +# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/ +# +# Finally, a summary of armv7 and aarch64 SIMD syntax differences: +# +# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not. +# +# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones). +# aarch64 names registers like v0, and denotes half-width operations in an +# instruction suffix (see below). +# +# * aarch64 embeds size and lane information in register suffixes. v0.16b is +# 16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s. +# armv7 embeds the total size in the register name (see above) and the size of +# each element in an instruction suffix, which may look like vmov.i8, +# vshr.u8, or vtbl.8, depending on instruction. 
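+#
+# To make the q{N}#lo / q{N}#hi notation concrete, here is a small
+# illustrative expansion using the same registers as the example above.
+# The post-processing loop at the end of this file rewrites q{N}#lo to
+# d{2*N} and q{N}#hi to d{2*N+1}, so the pair
+#
+#   vtbl.8 q0#lo, {q1}, q2#lo
+#   vtbl.8 q0#hi, {q1}, q2#hi
+#
+# is printed as
+#
+#   vtbl.8 d0, {q1}, d4
+#   vtbl.8 d1, {q1}, d5
+#
+# i.e. one 128-bit byte shuffle expressed as two 64-bit vtbl instructions
+# sharing the 128-bit table register q1.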
+ +use strict; + +my $flavour = shift; +my $output; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; +my $dir=$1; +my $xlate; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +my $code = ""; + +$code.=<<___; +.syntax unified + +.arch armv7-a +.fpu neon + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + +.type _vpaes_consts,%object +.align 7 @ totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: @ mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:@ mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: @ sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +.Lk_inv: @ inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: @ input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: @ sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: @ sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: @ sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.asciz "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 +___ + +{ +my ($inp,$out,$key) = map("r$_", (0..2)); + +my ($invlo,$invhi) = map("q$_", (10..11)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15)); + +$code.=<<___; +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +.type _vpaes_preheat,%function +.align 4 +_vpaes_preheat: + adr r10, .Lk_inv + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10]! @ .Lk_inv + add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo + vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 + vld1.64 {q14,q15}, [r10] @ .Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [$key] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov r9, $key + ldr r8, [$key,#240] @ pull rounds + adr r11, .Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, .Lk_mc_forward+16 + vld1.64 {q5}, [r9]! 
@ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 + vtbl.8 q1#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 q1#hi, {q2}, q1#hi + vtbl.8 q2#lo, {q3}, q0#lo @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 q2#hi, {q3}, q0#hi + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. + tst r8, r8 + b .Lenc_entry + +.align 4 +.Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 q4#lo, {$sb1t}, q2#lo @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 q4#hi, {$sb1t}, q2#hi + vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + vtbl.8 q0#lo, {$sb1u}, q3#lo @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 q0#hi, {$sb1u}, q3#hi + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 q5#lo, {$sb2t}, q2#lo @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 q5#hi, {$sb2t}, q2#hi + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 q2#lo, {$sb2u}, q3#lo @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 q2#hi, {$sb2u}, q3#hi + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + vtbl.8 q3#lo, {q0}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 q3#hi, {q0}, q1#hi + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 q5#lo, {q0}, q4#lo @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 q5#hi, {q0}, q4#hi + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 q4#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 q4#hi, {q3}, q1#hi + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and \$0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +.Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i + vtbl.8 q5#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 q5#hi, {$invhi}, q1#hi + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 q3#hi, {$invlo}, q0#hi + vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 q4#hi, {$invlo}, q1#hi + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 q2#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 q2#hi, {$invlo}, q3#hi + vtbl.8 q3#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 q3#hi, {$invlo}, q4#hi + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne .Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, .Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! 
@ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vtbl.8 q4#lo, {q1}, q2#lo @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 q4#hi, {q1}, q2#hi + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. + vtbl.8 q2#lo, {q0}, q3#lo @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 q2#hi, {q0}, q3#hi + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 q0#hi, {q2}, q1#hi + bx lr +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core +___ +} +{ +my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3"); +my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12)); + +$code.=<<___; +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ +.type _vpaes_key_consts,%object +.align 4 +_vpaes_key_consts: +.Lk_rcon: @ rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: @ output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: @ deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 +.size _vpaes_key_consts,.-_vpaes_key_consts + +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr r11, .Lk_rcon + vmov.i8 $s63, #0x5b @ .Lk_s63 + adr r10, .Lk_inv @ Must be aligned to 8 mod 16. + vmov.i8 $s0F, #0x0f @ .Lk_s0F + vld1.64 {$invlo,$invhi}, [r10] @ .Lk_inv + vld1.64 {$rcon}, [r11] @ .Lk_rcon + bx lr +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. + stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [$inp]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, .Lk_sr @ Must be aligned to 8 mod 16. + vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) + + @ *ring*: Decryption removed. + +.Lschedule_go: + cmp $bits, #192 @ cmp \$192, %esi + bhi .Lschedule_256 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. 
+@@ +.Lschedule_128: + mov $inp, #10 @ mov \$10, %esi + +.Loop_schedule_128: + bl _vpaes_schedule_round + subs $inp, $inp, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b .Loop_schedule_128 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. +@@ +.align 4 +.Lschedule_256: + vld1.64 {q0}, [$inp] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov $inp, #7 @ mov \$7, %esi + +.Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs $inp, $inp, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. swap xmm7 and xmm6 + vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +.Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform + add $out, $out, #32 @ add \$32, %rdx + vmov q2, q0 + vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 q0#hi, {q2}, q1#hi + +.Lschedule_mangle_last_dec: + sub $out, $out, #16 @ add \$-16, %rdx + veor q0, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. 
+@@ +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, $rcon, q4, #15 @ vpalignr \$15, %xmm8, %xmm4, %xmm1 + vext.8 $rcon, $rcon, $rcon, #15 @ vpalignr \$15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr \$1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. + @ We pin other values in _vpaes_key_preheat, so load them now. + adr r11, .Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq \$4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq \$8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 q2#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 q2#hi, {$invhi}, q1#hi + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 q3#hi, {$invlo}, q0#hi + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 q4#hi, {$invlo}, q1#hi + veor q7, q7, $s63 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 q3#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 q3#hi, {$invlo}, q3#hi + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 q2#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 q2#hi, {$invlo}, q4#hi + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 q4#lo, {q15}, q3#lo @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 q4#hi, {q15}, q3#hi + vtbl.8 q1#lo, {q14}, q2#lo @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 q1#hi, {q14}, q2#hi + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 + vtbl.8 q2#lo, {q14}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 q2#hi, {q14}, q1#hi + vtbl.8 q0#lo, {q15}, q0#lo @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 q0#hi, {q15}, q0#hi + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. 
+@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + tst $dir, $dir + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. + veor q2, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add $out, $out, #16 @ add \$16, %rdx + vtbl.8 q4#lo, {q2}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 q4#hi, {q2}, q5#hi + vtbl.8 q1#lo, {q4}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 q1#hi, {q4}, q5#hi + vtbl.8 q3#lo, {q1}, q5#lo @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 q3#hi, {q1}, q5#hi + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. + vtbl.8 q2#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 q2#hi, {q3}, q1#hi + add r8, r8, #64-16 @ add \$-16, %r8 + and r8, r8, #~(1<<6) @ and \$0x30, %r8 + vst1.64 {q2}, [$out] @ vmovdqu %xmm3, (%rdx) + bx lr +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stmdb sp!, {r7-r11, lr} + vstmdb sp!, {d8-d15} + + lsr r9, $bits, #5 @ shr \$5,%eax + add r9, r9, #5 @ \$5,%eax + str r9, [$out,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov $dir, #0 @ mov \$0,%ecx + mov r8, #0x30 @ mov \$0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8-d15} + ldmia sp!, {r7-r11, pc} @ return +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key +___ +} + +{ +my ($out, $inp) = map("r$_", (0..1)); +my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12)); + +$code .= <<___; + +@ Additional constants for converting to bsaes. +.type _vpaes_convert_consts,%object +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. 
This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1])) +.Lk_opt_then_skew: + .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b + .quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl vpaes_encrypt_key_to_bsaes +.type vpaes_encrypt_key_to_bsaes,%function +.align 4 +vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 $s0F, #0x0f @ Required by _vpaes_schedule_transform + adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {$mc_forward}, [r2] + vmov.i8 $s63, #0x5b @ .Lk_s63 from vpaes-x86_64 + adr r11, .Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 $s63_raw, #0x63 @ .LK_s63 without .Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [$inp,#240] + add r2, r2, #1 + str r2, [$out,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [$inp]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [$out]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +.Loop_enc_key_to_bsaes: + vld1.64 {q0}, [$inp]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 q2#lo, {q0}, q1#lo + vtbl.8 q2#hi, {q0}, q1#hi + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. 
+ vtbl.8 q1#lo, {q0}, $mc_forward#lo + vtbl.8 q1#hi, {q0}, $mc_forward#hi + vmov q0, q1 + vtbl.8 q2#lo, {q1}, $mc_forward#lo + vtbl.8 q2#hi, {q1}, $mc_forward#hi + veor q0, q0, q2 + vtbl.8 q1#lo, {q2}, $mc_forward#lo + vtbl.8 q1#hi, {q2}, $mc_forward#hi + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, $s63 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [$out]! + b .Loop_enc_key_to_bsaes + +.Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ $s63_raw, not $s63. + veor q0, q0, $s63_raw + vrev32.8 q0, q0 + vst1.64 {q0}, [$out] + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes +___ +} + +{ +# Register-passed parameters. +my ($inp, $out, $len, $key) = map("r$_", 0..3); +# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and +# $tmp. $ctr is r7 because it must be preserved across calls. +my ($ctr, $ivec, $tmp) = map("r$_", 7..9); + +# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, +# const AES_KEY *key, const uint8_t ivec[16]); +$code .= <<___; +.globl vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7-r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8-d15} + + cmp $len, #0 + @ $ivec is passed on the stack. + ldr $ivec, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap $len and $key. + mov $tmp, $key + mov $key, $len + mov $len, $tmp +___ +my ($len, $key) = ($key, $len); +$code .= <<___; + + @ Load the IV and counter portion. + ldr $ctr, [$ivec, #12] + vld1.8 {q7}, [$ivec] + + bl _vpaes_preheat + rev $ctr, $ctr @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [$inp]! @ Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [$out]! + subs $len, $len, #1 + @ Update the counter. + add $ctr, $ctr, #1 + rev $tmp, $ctr + vmov.32 q7#hi[1], $tmp + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8-d15} + ldmia sp!, {r7-r11, pc} @ return +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +___ +} + +foreach (split("\n",$code)) { + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv8.pl new file mode 100644 index 0000000000..d5e4bce897 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-armv8.pl @@ -0,0 +1,824 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# ARMv8 NEON adaptation by +# +# Reason for undertaken effort is that there is at least one popular +# SoC based on Cortex-A53 that doesn't have crypto extensions. +# +# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] +# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] +# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] +# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] +# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] +# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] +# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] +# +# (*) ECB denotes approximate result for parallelizable modes +# such as CBC decrypt, CTR, etc.; +# (**) these results are worse than scalar compiler-generated +# code, but it's constant-time and therefore preferred; +# (***) presented for reference/comparison purposes; + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$code.=<<___; +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:// mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew 
x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +___ + +{ +my ($inp,$out,$key) = map("x$_",(0..2)); + +my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); +my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); + +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adrp x11, :pg_hi21:.Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adrp x11, :pg_hi21:.Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {$iptlo}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {$ipthi}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {$sb1t}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {$sb1u}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {$sb2t}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {$sb2u}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor 
v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {$sbot}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x +___ +} +{ +my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); +my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); + +$code.=<<___; +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, :pg_hi21:.Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, :pg_hi21:.Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 + adrp x11, :pg_hi21:.Lk_mc_forward + add x11, x11, 
:lo12:.Lk_mc_forward + ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) + + cmp $bits, #192 // cmp \$192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov $inp, #10 // mov \$10, %esi + +.Loop_schedule_128: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +.Lschedule_192: + sub $inp, $inp, #8 + ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov $inp, #4 // mov \$4, %esi + +.Loop_schedule_192: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov $inp, #7 // mov \$7, %esi + +.Loop_schedule_256: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz $dir, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add $out, $out, #32 // add \$32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d-v21.2d}, [x11] // reload constants + sub $out, $out, #16 // add \$-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1 + ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add $out, $out, #16 // add \$16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add \$-16, %r8 + and x8, x8, #~(1<<6) // and \$0x30, %r8 + st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, $bits, #5 // shr \$5,%eax + add w9, w9, #5 // \$5,%eax + str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov $dir, #0 // mov \$0,%ecx + mov x8, #0x30 // mov \$0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key +___ +} +{ +my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4)); +my ($ctr, $ctr_tmp) = ("w6", "w7"); + +# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, +# const AES_KEY *key, const uint8_t ivec[16]); +$code.=<<___; +.globl vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz $len, .Lctr32_done + + // Note, unlike the other functions, $len here is measured in blocks, + // not bytes. + mov x17, $len + mov x2, $key + + // Load the IV and counter portion. + ldr $ctr, [$ivec, #12] + ld1 {v7.16b}, [$ivec] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev $ctr, $ctr // The counter is big-endian. + b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [$inp], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [$out], #16 + subs x17, x17, #1 + // Update the counter. + add $ctr, $ctr, #1 + rev $ctr_tmp, $ctr + mov v7.s[3], $ctr_tmp + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. 
+ mov v15.16b, v7.16b + mov v14.16b, v7.16b + add $ctr, $ctr, #1 + rev $ctr_tmp, $ctr + mov v15.s[3], $ctr_tmp + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [$out], #32 + subs x17, x17, #2 + // Update the counter. + add $ctr_tmp, $ctr, #1 + add $ctr, $ctr, #2 + rev $ctr_tmp, $ctr_tmp + mov v14.s[3], $ctr_tmp + rev $ctr_tmp, $ctr + mov v15.s[3], $ctr_tmp + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +___ +} + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86.pl new file mode 100644 index 0000000000..6410e7a752 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86.pl @@ -0,0 +1,617 @@ +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for +# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-586.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86.pl column - [also +# large-block CBC] encrypt/decrypt. +# +# aes-586.pl vpaes-x86.pl +# +# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) +# Nehalem 27.9/40.4/18.1 10.2/11.9 +# Atom 70.7/92.1/60.1 61.1/75.4(***) +# Silvermont 45.4/62.9/24.1 49.2/61.1(***) +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. 
+# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +28%/64% improvement on Core 2 +# and +15% on Atom (as implied, over "hyper-threading-safe" +# code path). +# +# + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../../perlasm"); +require "x86asm.pl"; + +$output = pop; +open OUT,">$output"; +*STDOUT=*OUT; + +&asm_init($ARGV[0]); + +$PREFIX="vpaes"; + +my ($round, $base, $magic, $key, $const, $inp, $out)= + ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); + +&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST") +&external_label("BORINGSSL_function_hit"); +&preprocessor_endif(); +&static_label("_vpaes_consts"); +&static_label("_vpaes_schedule_low_round"); + +&set_label("_vpaes_consts",64); +$k_inv=-0x30; # inv, inva + &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); + &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); + +$k_s0F=-0x10; # s0F + &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); + +$k_ipt=0x00; # input transform (lo, hi) + &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); + &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); + +$k_sb1=0x20; # sb1u, sb1t + &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); + &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); +$k_sb2=0x40; # sb2u, sb2t + &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); + &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); +$k_sbo=0x60; # sbou, sbot + &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); + &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); + +$k_mc_forward=0x80; # mc_forward + &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); + &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); + &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); + &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); + +$k_mc_backward=0xc0; # mc_backward + &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); + &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); + &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); + &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); + +$k_sr=0x100; # sr + &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); + &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); + &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); + &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); + +$k_rcon=0x140; # rcon + &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); + +$k_s63=0x150; # s63: all equal to 0x63 transformed + &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); + +$k_opt=0x160; # output transform + &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); + &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); + +$k_deskew=0x180; # deskew tables: inverts the sbox's "skew" + &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); + &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); + +&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); +&align (64); + +&function_begin_B("_vpaes_preheat"); + &add ($const,&DWP(0,"esp")); + &movdqa ("xmm7",&QWP($k_inv,$const)); + &movdqa ("xmm6",&QWP($k_s0F,$const)); + &ret (); +&function_end_B("_vpaes_preheat"); + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm6-%xmm7 as in _vpaes_preheat +## (%edx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx +## +## +&function_begin_B("_vpaes_encrypt_core"); + &mov ($magic,16); + &mov ($round,&DWP(240,$key)); + &movdqa ("xmm1","xmm6") + &movdqa ("xmm2",&QWP($k_ipt,$const)); + &pandn ("xmm1","xmm0"); + &pand ("xmm0","xmm6"); + &movdqu ("xmm5",&QWP(0,$key)); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP($k_ipt+16,$const)); + &pxor ("xmm2","xmm5"); + &psrld ("xmm1",4); + &add ($key,16); + &pshufb ("xmm0","xmm1"); + &lea ($base,&DWP($k_mc_backward,$const)); + &pxor ("xmm0","xmm2"); + &jmp (&label("enc_entry")); + + +&set_label("enc_loop",16); + # middle of middle round + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t + &pshufb ("xmm4","xmm2"); # 4 = sb1u + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u + &pxor ("xmm0","xmm4"); # 0 = A + &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] + &pshufb ("xmm5","xmm2"); # 4 = sb2u + &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t + &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] + &pshufb ("xmm2","xmm3"); # 2 = sb2t + &movdqa ("xmm3","xmm0"); # 3 = A + &pxor ("xmm2","xmm5"); # 2 = 2A + &pshufb ("xmm0","xmm1"); # 0 = B + &add ($key,16); # next key + &pxor ("xmm0","xmm2"); # 0 = 2A+B + &pshufb ("xmm3","xmm4"); # 3 = D + &add ($magic,16); # next mc + &pxor ("xmm3","xmm0"); # 3 = 2A+B+D + &pshufb ("xmm0","xmm1"); # 0 = 2B+C + &and ($magic,0x30); # ... mod 4 + &sub ($round,1); # nr-- + &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D + +&set_label("enc_entry"); + # top of round + &movdqa ("xmm1","xmm6"); # 1 : i + &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k + &pandn ("xmm1","xmm0"); # 1 = i<<4 + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm6"); # 0 = k + &pshufb ("xmm5","xmm0"); # 2 = a/k + &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pxor ("xmm0","xmm1"); # 0 = j + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &pxor ("xmm2","xmm0"); # 2 = io + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &movdqu ("xmm5",&QWP(0,$key)); + &pxor ("xmm3","xmm1"); # 3 = jo + &jnz (&label("enc_loop")); + + # middle of last round + &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo + &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 + &pshufb ("xmm4","xmm2"); # 4 = sbou + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] + &pxor ("xmm0","xmm4"); # 0 = A + &pshufb ("xmm0","xmm1"); + &ret (); +&function_end_B("_vpaes_encrypt_core"); + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +&function_begin_B("_vpaes_schedule_core"); + &add ($const,&DWP(0,"esp")); + &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) + &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon + + # input transform + &movdqa ("xmm3","xmm0"); + &lea ($base,&DWP($k_ipt,$const)); + &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 + &call ("_vpaes_schedule_transform"); + &movdqa ("xmm7","xmm0"); + + &test ($out,$out); + &jnz 
(&label("schedule_am_decrypting")); + + # encrypting, output zeroth round key after transform + &movdqu (&QWP(0,$key),"xmm0"); + &jmp (&label("schedule_go")); + +&set_label("schedule_am_decrypting"); + # decrypting, output zeroth round key after shiftrows + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &movdqu (&QWP(0,$key),"xmm3"); + &xor ($magic,0x30); + +&set_label("schedule_go"); + &cmp ($round,192); + &ja (&label("schedule_256")); + # 192-bit key support was removed. + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +&set_label("schedule_128"); + &mov ($round,10); + +&set_label("loop_schedule_128"); + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); # write output + &jmp (&label("loop_schedule_128")); + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +&set_label("schedule_256",16); + &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) + &call ("_vpaes_schedule_transform"); # input transform + &mov ($round,7); + +&set_label("loop_schedule_256"); + &call ("_vpaes_schedule_mangle"); # output low result + &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 + + # high round + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); + + # low round. swap xmm7 and xmm6 + &pshufd ("xmm0","xmm0",0xFF); + &movdqa (&QWP(20,"esp"),"xmm7"); + &movdqa ("xmm7","xmm6"); + &call ("_vpaes_schedule_low_round"); + &movdqa ("xmm7",&QWP(20,"esp")); + + &jmp (&label("loop_schedule_256")); + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +&set_label("schedule_mangle_last",16); + # schedule last round key from xmm0 + &lea ($base,&DWP($k_deskew,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_last_dec")); + + # encrypting + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm0","xmm1"); # output permute + &lea ($base,&DWP($k_opt,$const)); # prepare to output transform + &add ($key,32); + +&set_label("schedule_mangle_last_dec"); + &add ($key,-16); + &pxor ("xmm0",&QWP($k_s63,$const)); + &call ("_vpaes_schedule_transform"); # output transform + &movdqu (&QWP(0,$key),"xmm0"); # save last key + + # cleanup + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); + &ret (); +&function_end_B("_vpaes_schedule_core"); + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. 
+## Clobbers %xmm1-%xmm5. +## +&function_begin_B("_vpaes_schedule_round"); + # extract rcon from xmm8 + &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 + &pxor ("xmm1","xmm1"); + &palignr("xmm1","xmm2",15); + &palignr("xmm2","xmm2",15); + &pxor ("xmm7","xmm1"); + + # rotate + &pshufd ("xmm0","xmm0",0xFF); + &palignr("xmm0","xmm0",1); + + # fall through... + &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 + + # low round: same as high round, but no rotation and no rcon. +&set_label("_vpaes_schedule_low_round"); + # smear xmm7 + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",4); + &pxor ("xmm7","xmm1"); + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",8); + &pxor ("xmm7","xmm1"); + &pxor ("xmm7",&QWP($k_s63,$const)); + + # subbyte + &movdqa ("xmm4",&QWP($k_s0F,$const)); + &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j + &movdqa ("xmm1","xmm4"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm4"); # 0 = k + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm2","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm5"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm5"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm5"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm5"); # 3 : 1/jak + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou + &pshufb ("xmm4","xmm2"); # 4 = sbou + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = sbox output + + # add in smeared stuff + &pxor ("xmm0","xmm7"); + &movdqa ("xmm7","xmm0"); + &ret (); +&function_end_B("_vpaes_schedule_round"); + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%ebx) +## +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +&function_begin_B("_vpaes_schedule_transform"); + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); + &pand ("xmm0","xmm2"); + &movdqa ("xmm2",&QWP(0,$base)); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP(16,$base)); + &pshufb ("xmm0","xmm1"); + &pxor ("xmm0","xmm2"); + &ret (); +&function_end_B("_vpaes_schedule_transform"); + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%edx), and increments or decrements it +## Keeps track of round number mod 4 in %ecx +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +&function_begin_B("_vpaes_schedule_mangle"); + &movdqa ("xmm4","xmm0"); # save xmm0 for later + &movdqa ("xmm5",&QWP($k_mc_forward,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_dec")); + + # encrypting + &add ($key,16); + &pxor ("xmm4",&QWP($k_s63,$const)); + &pshufb ("xmm4","xmm5"); + &movdqa ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + + &jmp (&label("schedule_mangle_both")); + +&set_label("schedule_mangle_dec",16); + # inverse mix columns + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &lea ($inp,&DWP($k_dksd,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm4"); + &psrld ("xmm1",4); # 1 = hi + &pand ("xmm4","xmm2"); # 4 = lo + + &movdqa ("xmm2",&QWP(0,$inp)); + &pshufb ("xmm2","xmm4"); + &movdqa ("xmm3",&QWP(0x10,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x20,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x30,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x40,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x50,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x60,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x70,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + + &add ($key,-16); + +&set_label("schedule_mangle_both"); + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &add ($magic,-16); + &and ($magic,0x30); + &movdqu (&QWP(0,$key),"xmm3"); + &ret (); +&function_end_B("_vpaes_schedule_mangle"); + +# +# Interface to OpenSSL +# +&function_begin("${PREFIX}_set_encrypt_key"); + record_function_hit(5); + + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($round,&wparam(1)); # bits + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &mov ($base,$round); + &shr ($base,5); + &add ($base,5); + &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; + &mov ($magic,0x30); + &mov ($out,0); + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_schedule_core"); +&set_label("pic_point"); + + &mov ("esp",&DWP(48,"esp")); + &xor ("eax","eax"); +&function_end("${PREFIX}_set_encrypt_key"); + +&function_begin("${PREFIX}_encrypt"); + record_function_hit(4); + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($out,&wparam(1)); # out + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &movdqu ("xmm0",&QWP(0,$inp)); + &call ("_vpaes_encrypt_core"); + &movdqu (&QWP(0,$out),"xmm0"); + + &mov ("esp",&DWP(48,"esp")); +&function_end("${PREFIX}_encrypt"); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git 
a/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl new file mode 100644 index 0000000000..09c1ba37ba --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl @@ -0,0 +1,1023 @@ +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Interface to OpenSSL as "almost" drop-in replacement for +# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-x86_64.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86_64.pl column - +# [also large-block CBC] encrypt/decrypt. +# +# aes-x86_64.pl vpaes-x86_64.pl +# +# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) +# Nehalem 29.6/40.3/14.6 10.0/11.8 +# Atom 57.3/74.2/32.1 60.9/77.2(***) +# Silvermont 52.7/64.0/19.5 48.8/60.8(***) +# Goldmont 38.9/49.0/17.8 10.6/12.6 +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. +# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +36%/62% improvement on Core 2 +# (as implied, over "hyper-threading-safe" code path). +# +# + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$PREFIX="vpaes"; + +$code.=<<___; +.text + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,\@abi-omnipotent +.align 16 +_vpaes_encrypt_core: +.cfi_startproc + mov %rdx, %r9 + mov \$16, %r11 + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa .Lk_ipt(%rip), %xmm2 # iptlo + pandn %xmm0, %xmm1 + movdqu (%r9), %xmm5 # round0 key + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + pshufb %xmm0, %xmm2 + movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi + pshufb %xmm1, %xmm0 + pxor %xmm5, %xmm2 + add \$16, %r9 + pxor %xmm2, %xmm0 + lea .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + # middle of middle round + movdqa %xmm13, %xmm4 # 4 : sb1u + movdqa %xmm12, %xmm0 # 0 : sb1t + pshufb %xmm2, %xmm4 # 4 = sb1u + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm5, %xmm4 # 4 = sb1u + k + movdqa %xmm15, %xmm5 # 4 : sb2u + pxor %xmm4, %xmm0 # 0 = A + movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + pshufb %xmm2, %xmm5 # 4 = sb2u + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + movdqa %xmm14, %xmm2 # 2 : sb2t + pshufb %xmm3, %xmm2 # 2 = sb2t + movdqa %xmm0, %xmm3 # 3 = A + pxor %xmm5, %xmm2 # 2 = 2A + pshufb %xmm1, %xmm0 # 0 = B + add \$16, %r9 # next key + pxor %xmm2, %xmm0 # 0 = 2A+B + pshufb %xmm4, %xmm3 # 3 = D + add \$16, %r11 # next mc + pxor %xmm0, %xmm3 # 3 = 2A+B+D + pshufb %xmm1, %xmm0 # 0 = 2B+C + and \$0x30, %r11 # ... mod 4 + sub \$1,%rax # nr-- + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D + +.Lenc_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + movdqa %xmm11, %xmm5 # 2 : a/k + pandn %xmm0, %xmm1 # 1 = i<<4 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + pshufb %xmm0, %xmm5 # 2 = a/k + movdqa %xmm10, %xmm3 # 3 : 1/i + pxor %xmm1, %xmm0 # 0 = j + pshufb %xmm1, %xmm3 # 3 = 1/i + movdqa %xmm10, %xmm4 # 4 : 1/j + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k + pshufb %xmm0, %xmm4 # 4 = 1/j + movdqa %xmm10, %xmm2 # 2 : 1/iak + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k + pshufb %xmm3, %xmm2 # 2 = 1/iak + movdqa %xmm10, %xmm3 # 3 : 1/jak + pxor %xmm0, %xmm2 # 2 = io + pshufb %xmm4, %xmm3 # 3 = 1/jak + movdqu (%r9), %xmm5 + pxor %xmm1, %xmm3 # 3 = jo + jnz .Lenc_loop + + # middle of last round + movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + pshufb %xmm2, %xmm4 # 4 = sbou + pxor %xmm5, %xmm4 # 4 = sb1u + k + pshufb %xmm3, %xmm0 # 0 = sb1t + movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + pxor %xmm4, %xmm0 # 0 = A + pshufb %xmm1, %xmm0 + ret +.cfi_endproc +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +## +## _aes_encrypt_core_2x +## +## AES-encrypt %xmm0 and %xmm6 in parallel. +## +## Inputs: +## %xmm0 and %xmm6 = input +## %xmm9 and %xmm10 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 and %xmm6 +## Clobbers %xmm1-%xmm5, %xmm7, %xmm8, %xmm11-%xmm13, %r9, %r10, %r11, %rax +## Preserves %xmm14 and %xmm15 +## +## This function stitches two parallel instances of _vpaes_encrypt_core. x86_64 +## provides 16 XMM registers. _vpaes_encrypt_core computes over six registers +## (%xmm0-%xmm5) and additionally uses seven registers with preloaded constants +## from _vpaes_preheat (%xmm9-%xmm15). This does not quite fit two instances, +## so we spill some of %xmm9 through %xmm15 back to memory. We keep %xmm9 and +## %xmm10 in registers as these values are used several times in a row. The +## remainder are read once per round and are spilled to memory. 
This leaves two +## registers preserved for the caller. +## +## Thus, of the two _vpaes_encrypt_core instances, the first uses (%xmm0-%xmm5) +## as before. The second uses %xmm6-%xmm8,%xmm11-%xmm13. (Add 6 to %xmm2 and +## below. Add 8 to %xmm3 and up.) Instructions in the second instance are +## indented by one space. +## +## +.type _vpaes_encrypt_core_2x,\@abi-omnipotent +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + mov %rdx, %r9 + mov \$16, %r11 + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa %xmm9, %xmm7 + movdqa .Lk_ipt(%rip), %xmm2 # iptlo + movdqa %xmm2, %xmm8 + pandn %xmm0, %xmm1 + pandn %xmm6, %xmm7 + movdqu (%r9), %xmm5 # round0 key + # Also use %xmm5 in the second instance. + psrld \$4, %xmm1 + psrld \$4, %xmm7 + pand %xmm9, %xmm0 + pand %xmm9, %xmm6 + pshufb %xmm0, %xmm2 + pshufb %xmm6, %xmm8 + movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi + movdqa %xmm0, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm7, %xmm6 + pxor %xmm5, %xmm2 + pxor %xmm5, %xmm8 + add \$16, %r9 + pxor %xmm2, %xmm0 + pxor %xmm8, %xmm6 + lea .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + # middle of middle round + movdqa .Lk_sb1(%rip), %xmm4 # 4 : sb1u + movdqa .Lk_sb1+16(%rip),%xmm0 # 0 : sb1t + movdqa %xmm4, %xmm12 + movdqa %xmm0, %xmm6 + pshufb %xmm2, %xmm4 # 4 = sb1u + pshufb %xmm8, %xmm12 + pshufb %xmm3, %xmm0 # 0 = sb1t + pshufb %xmm11, %xmm6 + pxor %xmm5, %xmm4 # 4 = sb1u + k + pxor %xmm5, %xmm12 + movdqa .Lk_sb2(%rip), %xmm5 # 4 : sb2u + movdqa %xmm5, %xmm13 + pxor %xmm4, %xmm0 # 0 = A + pxor %xmm12, %xmm6 + movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + # Also use %xmm1 in the second instance. + pshufb %xmm2, %xmm5 # 4 = sb2u + pshufb %xmm8, %xmm13 + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + # Also use %xmm4 in the second instance. + movdqa .Lk_sb2+16(%rip), %xmm2 # 2 : sb2t + movdqa %xmm2, %xmm8 + pshufb %xmm3, %xmm2 # 2 = sb2t + pshufb %xmm11, %xmm8 + movdqa %xmm0, %xmm3 # 3 = A + movdqa %xmm6, %xmm11 + pxor %xmm5, %xmm2 # 2 = 2A + pxor %xmm13, %xmm8 + pshufb %xmm1, %xmm0 # 0 = B + pshufb %xmm1, %xmm6 + add \$16, %r9 # next key + pxor %xmm2, %xmm0 # 0 = 2A+B + pxor %xmm8, %xmm6 + pshufb %xmm4, %xmm3 # 3 = D + pshufb %xmm4, %xmm11 + add \$16, %r11 # next mc + pxor %xmm0, %xmm3 # 3 = 2A+B+D + pxor %xmm6, %xmm11 + pshufb %xmm1, %xmm0 # 0 = 2B+C + pshufb %xmm1, %xmm6 + and \$0x30, %r11 # ... mod 4 + sub \$1,%rax # nr-- + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D + pxor %xmm11, %xmm6 + +.Lenc2x_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + movdqa %xmm9, %xmm7 + movdqa .Lk_inv+16(%rip), %xmm5 # 2 : a/k + movdqa %xmm5, %xmm13 + pandn %xmm0, %xmm1 # 1 = i<<4 + pandn %xmm6, %xmm7 + psrld \$4, %xmm1 # 1 = i + psrld \$4, %xmm7 + pand %xmm9, %xmm0 # 0 = k + pand %xmm9, %xmm6 + pshufb %xmm0, %xmm5 # 2 = a/k + pshufb %xmm6, %xmm13 + movdqa %xmm10, %xmm3 # 3 : 1/i + movdqa %xmm10, %xmm11 + pxor %xmm1, %xmm0 # 0 = j + pxor %xmm7, %xmm6 + pshufb %xmm1, %xmm3 # 3 = 1/i + pshufb %xmm7, %xmm11 + movdqa %xmm10, %xmm4 # 4 : 1/j + movdqa %xmm10, %xmm12 + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k + pxor %xmm13, %xmm11 + pshufb %xmm0, %xmm4 # 4 = 1/j + pshufb %xmm6, %xmm12 + movdqa %xmm10, %xmm2 # 2 : 1/iak + movdqa %xmm10, %xmm8 + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k + pxor %xmm13, %xmm12 + pshufb %xmm3, %xmm2 # 2 = 1/iak + pshufb %xmm11, %xmm8 + movdqa %xmm10, %xmm3 # 3 : 1/jak + movdqa %xmm10, %xmm11 + pxor %xmm0, %xmm2 # 2 = io + pxor %xmm6, %xmm8 + pshufb %xmm4, %xmm3 # 3 = 1/jak + pshufb %xmm12, %xmm11 + movdqu (%r9), %xmm5 + # Also use %xmm5 in the second instance. 
+ pxor %xmm1, %xmm3 # 3 = jo + pxor %xmm7, %xmm11 + jnz .Lenc2x_loop + + # middle of last round + movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + movdqa %xmm4, %xmm12 + movdqa %xmm0, %xmm6 + pshufb %xmm2, %xmm4 # 4 = sbou + pshufb %xmm8, %xmm12 + pxor %xmm5, %xmm4 # 4 = sb1u + k + pxor %xmm5, %xmm12 + pshufb %xmm3, %xmm0 # 0 = sb1t + pshufb %xmm11, %xmm6 + movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + # Also use %xmm1 in the second instance. + pxor %xmm4, %xmm0 # 0 = A + pxor %xmm12, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm6 + ret +.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_schedule_core,\@abi-omnipotent +.align 16 +_vpaes_schedule_core: +.cfi_startproc + # rdi = key + # rsi = size in bits + # rdx = buffer + # rcx = direction. 0=encrypt, 1=decrypt + + call _vpaes_preheat # load the tables + movdqa .Lk_rcon(%rip), %xmm8 # load rcon + movdqu (%rdi), %xmm0 # load key (unaligned) + + # input transform + movdqa %xmm0, %xmm3 + lea .Lk_ipt(%rip), %r11 + call _vpaes_schedule_transform + movdqa %xmm0, %xmm7 + + lea .Lk_sr(%rip),%r10 + + # encrypting, output zeroth round key after transform + movdqu %xmm0, (%rdx) + +.Lschedule_go: + cmp \$192, %esi + ja .Lschedule_256 + # 192-bit key support was removed. + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov \$10, %esi + +.Loop_schedule_128: + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle # write output + jmp .Loop_schedule_128 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + call _vpaes_schedule_transform # input transform + mov \$7, %esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle # output low result + movdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + # high round + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + # low round. swap xmm7 and xmm6 + pshufd \$0xFF, %xmm0, %xmm0 + movdqa %xmm7, %xmm5 + movdqa %xmm6, %xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5, %xmm7 + + jmp .Loop_schedule_256 + + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... 
jumps to cleanup and exits +## +.align 16 +.Lschedule_mangle_last: + # schedule last round key from xmm0 + lea .Lk_deskew(%rip),%r11 # prepare to deskew + + # encrypting + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1, %xmm0 # output permute + lea .Lk_opt(%rip), %r11 # prepare to output transform + add \$32, %rdx + +.Lschedule_mangle_last_dec: + add \$-16, %rdx + pxor .Lk_s63(%rip), %xmm0 + call _vpaes_schedule_transform # output transform + movdqu %xmm0, (%rdx) # save last key + + # cleanup + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,\@abi-omnipotent +.align 16 +_vpaes_schedule_round: +.cfi_startproc + # extract rcon from xmm8 + pxor %xmm1, %xmm1 + palignr \$15, %xmm8, %xmm1 + palignr \$15, %xmm8, %xmm8 + pxor %xmm1, %xmm7 + + # rotate + pshufd \$0xFF, %xmm0, %xmm0 + palignr \$1, %xmm0, %xmm0 + + # fall through... + + # low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + # smear xmm7 + movdqa %xmm7, %xmm1 + pslldq \$4, %xmm7 + pxor %xmm1, %xmm7 + movdqa %xmm7, %xmm1 + pslldq \$8, %xmm7 + pxor %xmm1, %xmm7 + pxor .Lk_s63(%rip), %xmm7 + + # subbytes + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm2 # 2 : a/k + pshufb %xmm0, %xmm2 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + movdqa %xmm13, %xmm4 # 4 : sbou + pshufb %xmm2, %xmm4 # 4 = sbou + movdqa %xmm12, %xmm0 # 0 : sbot + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = sbox output + + # add in smeared stuff + pxor %xmm7, %xmm0 + movdqa %xmm0, %xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,\@abi-omnipotent +.align 16 +_vpaes_schedule_transform: +.cfi_startproc + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + movdqa (%r11), %xmm2 # lo + pshufb %xmm0, %xmm2 + movdqa 16(%r11), %xmm0 # hi + pshufb %xmm1, %xmm0 + pxor %xmm2, %xmm0 + ret +.cfi_endproc +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,\@abi-omnipotent +.align 16 +_vpaes_schedule_mangle: +.cfi_startproc + movdqa %xmm0, %xmm4 # save xmm0 for later + movdqa .Lk_mc_forward(%rip),%xmm5 + + # encrypting + add \$16, %rdx + pxor .Lk_s63(%rip),%xmm4 + pshufb %xmm5, %xmm4 + movdqa %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + +.Lschedule_mangle_both: + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1,%xmm3 + add \$-16, %r8 + and \$0x30, %r8 + movdqu %xmm3, (%rdx) + ret +.cfi_endproc +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +# +# Interface to OpenSSL +# +.globl ${PREFIX}_set_encrypt_key +.type ${PREFIX}_set_encrypt_key,\@function,3 +.align 16 +${PREFIX}_set_encrypt_key: +.cfi_startproc + _CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit + movb \$1, BORINGSSL_function_hit+5(%rip) +#endif + +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lenc_key_body: +___ +$code.=<<___; + mov %esi,%eax + shr \$5,%eax + add \$5,%eax + mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov \$0,%ecx + mov \$0x30,%r8d + call _vpaes_schedule_core +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lenc_key_epilogue: +___ +$code.=<<___; + xor %eax,%eax + ret +.cfi_endproc +.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key +___ +{ +my ($inp,$out,$blocks,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8"); +# void vpaes_ctr32_encrypt_blocks(const uint8_t *inp, uint8_t *out, +# size_t blocks, const AES_KEY *key, +# const uint8_t ivp[16]); +$code.=<<___; +.globl ${PREFIX}_ctr32_encrypt_blocks +.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 +.align 16 +${PREFIX}_ctr32_encrypt_blocks: +.cfi_startproc + _CET_ENDBR + # _vpaes_encrypt_core and _vpaes_encrypt_core_2x expect the key in %rdx. + xchg $key, $blocks +___ +($blocks,$key)=($key,$blocks); +$code.=<<___; + test $blocks, $blocks + jz .Lctr32_abort +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lctr32_body: +___ +$code.=<<___; + movdqu ($ivp), %xmm0 # Load IV. + movdqa .Lctr_add_one(%rip), %xmm8 + sub $inp, $out # This allows only incrementing $inp. + call _vpaes_preheat + movdqa %xmm0, %xmm6 + pshufb .Lrev_ctr(%rip), %xmm6 + + test \$1, $blocks + jz .Lctr32_prep_loop + + # Handle one block so the remaining block count is even for + # _vpaes_encrypt_core_2x. 
+ movdqu ($inp), %xmm7 # Load input. + call _vpaes_encrypt_core + pxor %xmm7, %xmm0 + paddd %xmm8, %xmm6 + movdqu %xmm0, ($out,$inp) + sub \$1, $blocks + lea 16($inp), $inp + jz .Lctr32_done + +.Lctr32_prep_loop: + # _vpaes_encrypt_core_2x leaves only %xmm14 and %xmm15 as spare + # registers. We maintain two byte-swapped counters in them. + movdqa %xmm6, %xmm14 + movdqa %xmm6, %xmm15 + paddd %xmm8, %xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip), %xmm1 # Set up counters. + movdqa %xmm14, %xmm0 + movdqa %xmm15, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm6 + call _vpaes_encrypt_core_2x + movdqu ($inp), %xmm1 # Load input. + movdqu 16($inp), %xmm2 + movdqa .Lctr_add_two(%rip), %xmm3 + pxor %xmm1, %xmm0 # XOR input. + pxor %xmm2, %xmm6 + paddd %xmm3, %xmm14 # Increment counters. + paddd %xmm3, %xmm15 + movdqu %xmm0, ($out,$inp) # Write output. + movdqu %xmm6, 16($out,$inp) + sub \$2, $blocks # Advance loop. + lea 32($inp), $inp + jnz .Lctr32_loop + +.Lctr32_done: +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lctr32_epilogue: +___ +$code.=<<___; +.Lctr32_abort: + ret +.cfi_endproc +.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks +___ +} +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_preheat,\@abi-omnipotent +.align 16 +_vpaes_preheat: +.cfi_startproc + lea .Lk_s0F(%rip), %r10 + movdqa -0x20(%r10), %xmm10 # .Lk_inv + movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 + movdqa 0x00(%r10), %xmm9 # .Lk_s0F + movdqa 0x30(%r10), %xmm13 # .Lk_sb1 + movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 + movdqa 0x50(%r10), %xmm15 # .Lk_sb2 + movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 + ret +.cfi_endproc +.size _vpaes_preheat,.-_vpaes_preheat +######################################################## +## ## +## Constants ## +## ## +######################################################## +.type _vpaes_consts,\@object +.section .rodata +.align 64 +_vpaes_consts: +.Lk_inv: # inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: # s0F + .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: # input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: # sb1u, sb1t + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: # sb2u, sb2t + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: # sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: # mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward:# mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: # sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 
0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: # rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: # s63: all equal to 0x63 transformed + .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: # output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: # deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +# .Lrev_ctr is a permutation which byte-swaps the counter portion of the IV. +.Lrev_ctr: + .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 +# .Lctr_add_* may be added to a byte-swapped xmm register to increment the +# counter. The register must be byte-swapped again to form the actual input. +.Lctr_add_one: + .quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: + .quad 0x0000000000000000, 0x0000000200000000 + +.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" +.align 64 +.size _vpaes_consts,.-_vpaes_consts +.text +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 16(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xb8(%rax),%rax # adjust stack pointer + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_${PREFIX}_set_encrypt_key + .rva .LSEH_end_${PREFIX}_set_encrypt_key + .rva .LSEH_info_${PREFIX}_set_encrypt_key + + .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_info_${PREFIX}_ctr32_encrypt_blocks + +.section .xdata +.align 8 +.LSEH_info_${PREFIX}_set_encrypt_key: + .byte 9,0,0,0 + .rva se_handler + .rva 
.Lenc_key_body,.Lenc_key_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_ctr32_encrypt_blocks: + .byte 9,0,0,0 + .rva se_handler + .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/armv4-mont.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/armv4-mont.pl new file mode 100644 index 0000000000..acae4e582e --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/armv4-mont.pl @@ -0,0 +1,739 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# January 2007. + +# Montgomery multiplication for ARMv4. +# +# Performance improvement naturally varies among CPU implementations +# and compilers. The code was observed to provide +65-35% improvement +# [depending on key length, less for longer keys] on ARM920T, and +# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code +# base and compiler generated code with in-lined umull and even umlal +# instructions. The latter means that this code didn't really have an +# "advantage" of utilizing some "secret" instruction. +# +# The code is interoperable with Thumb ISA and is rather compact, less +# than 1/2KB. Windows CE port would be trivial, as it's exclusively +# about decorations, ABI and instruction syntax are identical. + +# November 2013 +# +# Add NEON code path, which handles lengths divisible by 8. RSA/DSA +# performance improvement on Cortex-A8 is ~45-100% depending on key +# length, more for longer keys. On Cortex-A15 the span is ~10-105%. +# On Snapdragon S4 improvement was measured to vary from ~70% to +# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is +# rather because original integer-only code seems to perform +# suboptimally on S4. Situation on Cortex-A9 is unfortunately +# different. It's being looked into, but the trouble is that +# performance for vectors longer than 256 bits is actually couple +# of percent worse than for integer-only code. The code is chosen +# for execution on all NEON-capable processors, because gain on +# others outweighs the marginal loss on Cortex-A9. + +# September 2015 +# +# Align Cortex-A9 performance with November 2013 improvements, i.e. +# NEON code is now ~20-105% faster than integer-only one on this +# processor. But this optimization further improved performance even +# on other processors: NEON code path is ~45-180% faster than original +# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on +# Snapdragon S4. 
+ +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$num="r0"; # starts as num argument, but holds &tp[num-1] +$ap="r1"; +$bp="r2"; $bi="r2"; $rp="r2"; +$np="r3"; +$tp="r4"; +$aj="r5"; +$nj="r6"; +$tj="r7"; +$n0="r8"; +########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer +$alo="r10"; # sl, gcc uses it to keep @GOT +$ahi="r11"; # fp +$nlo="r12"; # ip +########### # r13 is stack pointer +$nhi="r14"; # lr +########### # r15 is program counter + +#### argument block layout relative to &tp[num-1], a.k.a. $num +$_rp="$num,#12*4"; +# ap permanently resides in r1 +$_bp="$num,#13*4"; +# np permanently resides in r3 +$_n0="$num,#14*4"; +$_num="$num,#15*4"; $_bpend=$_num; + +$code=<<___; +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.global bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function + +.align 5 +bn_mul_mont_nohw: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block + cmp ip,#2 + mov $num,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4-r12,lr} @ save 10 registers + + mov $num,$num,lsl#2 @ rescale $num for byte count + sub sp,sp,$num @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub $num,$num,#4 @ "num=num-1" + add $tp,$bp,$num @ &bp[num-1] + + add $num,sp,$num @ $num to point at &tp[num-1] + ldr $n0,[$_n0] @ &n0 + ldr $bi,[$bp] @ bp[0] + ldr $aj,[$ap],#4 @ ap[0],ap++ + ldr $nj,[$np],#4 @ np[0],np++ + ldr $n0,[$n0] @ *n0 + str $tp,[$_bpend] @ save &bp[num] + + umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] + str $n0,[$_n0] @ save n0 value + mul $n0,$alo,$n0 @ "tp[0]"*n0 + mov $nlo,#0 + umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" + mov $tp,sp + +.L1st: + ldr $aj,[$ap],#4 @ ap[j],ap++ + mov $alo,$ahi + ldr $nj,[$np],#4 @ np[j],np++ + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] + mov $nhi,#0 + umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 + adds $nlo,$nlo,$alo + str $nlo,[$tp],#4 @ tp[j-1]=,tp++ + adc $nlo,$nhi,#0 + cmp $tp,$num + bne .L1st + + adds $nlo,$nlo,$ahi + ldr $tp,[$_bp] @ restore bp + mov $nhi,#0 + ldr $n0,[$_n0] @ restore n0 + adc $nhi,$nhi,#0 + str $nlo,[$num] @ tp[num-1]= + mov $tj,sp + str $nhi,[$num,#4] @ tp[num]= + +.Louter: + sub $tj,$num,$tj @ "original" $num-1 value + sub $ap,$ap,$tj @ "rewind" ap to &ap[1] + ldr $bi,[$tp,#4]! 
@ *(++bp) + sub $np,$np,$tj @ "rewind" np to &np[1] + ldr $aj,[$ap,#-4] @ ap[0] + ldr $alo,[sp] @ tp[0] + ldr $nj,[$np,#-4] @ np[0] + ldr $tj,[sp,#4] @ tp[1] + + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] + str $tp,[$_bp] @ save bp + mul $n0,$alo,$n0 + mov $nlo,#0 + umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" + mov $tp,sp + +.Linner: + ldr $aj,[$ap],#4 @ ap[j],ap++ + adds $alo,$ahi,$tj @ +=tp[j] + ldr $nj,[$np],#4 @ np[j],np++ + mov $ahi,#0 + umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] + mov $nhi,#0 + umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 + adc $ahi,$ahi,#0 + ldr $tj,[$tp,#8] @ tp[j+1] + adds $nlo,$nlo,$alo + str $nlo,[$tp],#4 @ tp[j-1]=,tp++ + adc $nlo,$nhi,#0 + cmp $tp,$num + bne .Linner + + adds $nlo,$nlo,$ahi + mov $nhi,#0 + ldr $tp,[$_bp] @ restore bp + adc $nhi,$nhi,#0 + ldr $n0,[$_n0] @ restore n0 + adds $nlo,$nlo,$tj + ldr $tj,[$_bpend] @ restore &bp[num] + adc $nhi,$nhi,#0 + str $nlo,[$num] @ tp[num-1]= + str $nhi,[$num,#4] @ tp[num]= + + cmp $tp,$tj +#ifdef __thumb2__ + itt ne +#endif + movne $tj,sp + bne .Louter + + ldr $rp,[$_rp] @ pull rp + mov $aj,sp + add $num,$num,#4 @ $num to point at &tp[num] + sub $aj,$num,$aj @ "original" num value + mov $tp,sp @ "rewind" $tp + mov $ap,$tp @ "borrow" $ap + sub $np,$np,$aj @ "rewind" $np to &np[0] + + subs $tj,$tj,$tj @ "clear" carry flag +.Lsub: ldr $tj,[$tp],#4 + ldr $nj,[$np],#4 + sbcs $tj,$tj,$nj @ tp[j]-np[j] + str $tj,[$rp],#4 @ rp[j]= + teq $tp,$num @ preserve carry + bne .Lsub + sbcs $nhi,$nhi,#0 @ upmost carry + mov $tp,sp @ "rewind" $tp + sub $rp,$rp,$aj @ "rewind" $rp + +.Lcopy: ldr $tj,[$tp] @ conditional copy + ldr $aj,[$rp] + str sp,[$tp],#4 @ zap tp +#ifdef __thumb2__ + it cc +#endif + movcc $aj,$tj + str $aj,[$rp],#4 + teq $tp,$num @ preserve carry + bne .Lcopy + + mov sp,$num + add sp,sp,#4 @ skip over tp[num+1] + ldmia sp!,{r4-r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: +#if __ARM_ARCH>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +___ +{ +my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); +my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); +my ($Z,$Temp)=("q4","q5"); +my @ACC=map("q$_",(6..13)); +my ($Bi,$Ni,$M0)=map("d$_",(28..31)); +my $zero="$Z#lo"; +my $temp="$Temp#lo"; + +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); +my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global bn_mul8x_mont_neon +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4-r11} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load rest of parameter block + mov ip,sp + + cmp $num,#8 + bhi .LNEON_8n + + @ special case for $num==8, everything is in register bank... + + vld1.32 {${Bi}[0]}, [$bptr,:32]! + veor $zero,$zero,$zero + sub $toutptr,sp,$num,lsl#4 + vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( + and $toutptr,$toutptr,#-64 + vld1.32 {${M0}[0]}, [$n0,:32] + mov sp,$toutptr @ alloca + vzip.16 $Bi,$zero + + vmull.u32 @ACC[0],$Bi,${A0}[0] + vmull.u32 @ACC[1],$Bi,${A0}[1] + vmull.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmull.u32 @ACC[3],$Bi,${A1}[1] + + vadd.u64 $Ni,$Ni,@ACC[0]#lo + veor $zero,$zero,$zero + vmul.u32 $Ni,$Ni,$M0 + + vmull.u32 @ACC[4],$Bi,${A2}[0] + vld1.32 {$N0-$N3}, [$nptr]! 
+ vmull.u32 @ACC[5],$Bi,${A2}[1] + vmull.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmull.u32 @ACC[7],$Bi,${A3}[1] + + vmlal.u32 @ACC[0],$Ni,${N0}[0] + sub $outer,$num,#1 + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmov $Temp,@ACC[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmov @ACC[0],@ACC[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmov @ACC[1],@ACC[2] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vmov @ACC[2],@ACC[3] + vmov @ACC[3],@ACC[4] + vshr.u64 $temp,$temp,#16 + vmov @ACC[4],@ACC[5] + vmov @ACC[5],@ACC[6] + vadd.u64 $temp,$temp,$Temp#hi + vmov @ACC[6],@ACC[7] + veor @ACC[7],@ACC[7] + vshr.u64 $temp,$temp,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {${Bi}[0]}, [$bptr,:32]! + veor $zero,$zero,$zero + vzip.16 $Bi,$zero + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp + + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + + vadd.u64 $Ni,$Ni,@ACC[0]#lo + veor $zero,$zero,$zero + subs $outer,$outer,#1 + vmul.u32 $Ni,$Ni,$M0 + + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] + + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmov $Temp,@ACC[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmov @ACC[0],@ACC[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmov @ACC[1],@ACC[2] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vmov @ACC[2],@ACC[3] + vmov @ACC[3],@ACC[4] + vshr.u64 $temp,$temp,#16 + vmov @ACC[4],@ACC[5] + vmov @ACC[5],@ACC[6] + vadd.u64 $temp,$temp,$Temp#hi + vmov @ACC[6],@ACC[7] + veor @ACC[7],@ACC[7] + vshr.u64 $temp,$temp,#16 + + bne .LNEON_outer8 + + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp + mov $toutptr,sp + vshr.u64 $temp,@ACC[0]#lo,#16 + mov $inner,$num + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + add $tinptr,sp,#96 + vshr.u64 $temp,@ACC[0]#hi,#16 + vzip.16 @ACC[0]#lo,@ACC[0]#hi + + b .LNEON_tail_entry + +.align 4 +.LNEON_8n: + veor @ACC[0],@ACC[0],@ACC[0] + sub $toutptr,sp,#128 + veor @ACC[1],@ACC[1],@ACC[1] + sub $toutptr,$toutptr,$num,lsl#4 + veor @ACC[2],@ACC[2],@ACC[2] + and $toutptr,$toutptr,#-64 + veor @ACC[3],@ACC[3],@ACC[3] + mov sp,$toutptr @ alloca + veor @ACC[4],@ACC[4],@ACC[4] + add $toutptr,$toutptr,#256 + veor @ACC[5],@ACC[5],@ACC[5] + sub $inner,$num,#8 + veor @ACC[6],@ACC[6],@ACC[6] + veor @ACC[7],@ACC[7],@ACC[7] + +.LNEON_8n_init: + vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! + subs $inner,$inner,#8 + vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! + vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! + vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]! + bne .LNEON_8n_init + + add $tinptr,sp,#256 + vld1.32 {$A0-$A3},[$aptr]! + add $bnptr,sp,#8 + vld1.32 {${M0}[0]},[$n0,:32] + mov $outer,$num + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ + veor $zero,$zero,$zero + vzip.16 $Bi,$zero + add $toutptr,sp,#128 + vld1.32 {$N0-$N3},[$nptr]! 
+ + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + veor $zero,$zero,$zero + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vadd.u64 $Ni,$Ni,@ACC[0]#lo + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmul.u32 $Ni,$Ni,$M0 + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +for ($i=0; $i<7;) { +$code.=<<___; + vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ + vmlal.u32 @ACC[0],$Ni,${N0}[0] + veor $temp,$temp,$temp + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vzip.16 $Bi,$temp + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo + vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i] +___ + push(@ACC,shift(@ACC)); $i++; +$code.=<<___; + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128]! + vmlal.u32 @ACC[1],$Bi,${A0}[1] + veor $zero,$zero,$zero + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vadd.u64 $Ni,$Ni,@ACC[0]#lo + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmul.u32 $Ni,$Ni,$M0 + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vld1.32 {$A0-$A3},[$aptr]! + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo + vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i] + add $bnptr,sp,#8 @ rewind +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + sub $inner,$num,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs $inner,$inner,#8 + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vld1.32 {$N0-$N3},[$nptr]! + vmlal.u32 @ACC[3],$Bi,${A1}[1] + it ne + addne $tinptr,$tinptr,#16 @ don't advance in last iteration + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i] + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vst1.64 {@ACC[0]},[$toutptr,:128]! +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vld1.32 {$Ni},[$bnptr,:64]! 
@ pull smashed m[8*i+$i] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + it ne + addne $tinptr,$tinptr,#16 @ don't advance in last iteration + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + it eq + subeq $aptr,$aptr,$num,lsl#2 @ rewind + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vld1.32 {$A0-$A3},[$aptr]! + vmlal.u32 @ACC[2],$Ni,${N1}[0] + add $bnptr,sp,#8 @ rewind + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vst1.64 {@ACC[0]},[$toutptr,:128]! + vmlal.u32 @ACC[7],$Ni,${N3}[1] + + bne .LNEON_8n_inner +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + add $tinptr,sp,#128 + vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! + veor q2,q2,q2 @ $N0-$N1 + vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! + veor q3,q3,q3 @ $N2-$N3 + vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! + vst1.64 {@ACC[6]},[$toutptr,:128] + + subs $outer,$outer,#8 + vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]! + vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]! + vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]! + vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]! + + itt ne + subne $nptr,$nptr,$num,lsl#2 @ rewind + bne .LNEON_8n_outer + + add $toutptr,sp,#128 + vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 $temp,@ACC[0]#lo,#16 + vst1.64 {q2-q3},[sp,:256]! + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + vst1.64 {q2-q3}, [sp,:256]! + vshr.u64 $temp,@ACC[0]#hi,#16 + vst1.64 {q2-q3}, [sp,:256]! + vzip.16 @ACC[0]#lo,@ACC[0]#hi + + mov $inner,$num + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp + vshr.u64 $temp,@ACC[0]#lo,#16 + vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]! + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]! + vshr.u64 $temp,@ACC[0]#hi,#16 + vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]! + vzip.16 @ACC[0]#lo,@ACC[0]#hi + +.LNEON_tail_entry: +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp + vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]! + vshr.u64 $temp,@ACC[1]#lo,#16 + vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp + vshr.u64 $temp,@ACC[1]#hi,#16 + vzip.16 @ACC[1]#lo,@ACC[1]#hi +___ + push(@ACC,shift(@ACC)); +} + push(@ACC,shift(@ACC)); +$code.=<<___; + vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]! + subs $inner,$inner,#8 + vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]! + bne .LNEON_tail + + vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit + sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr + subs $aptr,sp,#0 @ clear carry flag + add $bptr,sp,$num,lsl#2 + +.LNEON_sub: + ldmia $aptr!, {r4-r7} + ldmia $nptr!, {r8-r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq $aptr,$bptr @ preserves carry + stmia $rptr!, {r8-r11} + bne .LNEON_sub + + ldr r10, [$aptr] @ load top-most bit + mov r11,sp + veor q0,q0,q0 + sub r11,$bptr,r11 @ this is num*4 + veor q1,q1,q1 + mov $aptr,sp + sub $rptr,$rptr,r11 @ rewind $rptr + mov $nptr,$bptr @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia $aptr!, {r4-r7} + ldmia $rptr, {r8-r11} + it cc + movcc r8, r4 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [$nptr,:256]! 
@ wipe + it cc + movcc r11,r7 + ldmia $aptr, {r4-r7} + stmia $rptr!, {r8-r11} + sub $aptr,$aptr,#16 + ldmia $rptr, {r8-r11} + it cc + movcc r8, r4 + vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc + movcc r11,r7 + teq $aptr,$bptr @ preserves carry + stmia $rptr!, {r8-r11} + bne .LNEON_copy_n_zap + + mov sp,ip + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r11} + ret @ bx lr +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +___ +} +$code.=<<___; +.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by " +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or + s/\bret\b/bx lr/g or + s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/armv8-mont.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/armv8-mont.pl new file mode 100644 index 0000000000..0c32c8b147 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/armv8-mont.pl @@ -0,0 +1,1517 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# March 2015 +# +# "Teaser" Montgomery multiplication module for ARMv8. Needs more +# work. While it does improve RSA sign performance by 20-30% (less for +# longer keys) on most processors, for some reason RSA2048 is not +# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication +# instruction issue rate is limited on processor in question, meaning +# that dedicated squaring procedure is a must. Well, actually all +# contemporary AArch64 processors seem to have limited multiplication +# issue rate, i.e. they can't issue multiplication every cycle, which +# explains moderate improvement coefficients in comparison to +# compiler-generated code. Recall that compiler is instructed to use +# umulh and therefore uses same amount of multiplication instructions +# to do the job. Assembly's edge is to minimize number of "collateral" +# instructions and of course instruction scheduling. +# +# April 2015 +# +# Squaring procedure that handles lengths divisible by 8 improves +# RSA/DSA performance by 25-40-60% depending on processor and key +# length. Overall improvement coefficients are always positive in +# comparison to compiler-generated code. On Cortex-A57 improvement +# is still modest on longest key lengths, while others exhibit e.g. +# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster +# on Cortex-A57 and ~60-100% faster on others. 
+ +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +($lo0,$hi0,$aj,$m0,$alo,$ahi, + $lo1,$hi1,$nj,$m1,$nlo,$nhi, + $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24); + +# void bn_mul_mont_nohw( +$rp="x0"; # BN_ULONG *rp, +$ap="x1"; # const BN_ULONG *ap, +$bp="x2"; # const BN_ULONG *bp, +$np="x3"; # const BN_ULONG *np, +$n0="x4"; # const BN_ULONG *n0, +$num="x5"; # size_t num); + +$code.=<<___; +.text + +.globl bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function +.align 5 +bn_mul_mont_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr $m0,[$bp],#8 // bp[0] + sub $tp,sp,$num,lsl#3 + ldp $hi0,$aj,[$ap],#16 // ap[0..1] + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + and $tp,$tp,#-16 // ABI says so + ldp $hi1,$nj,[$np],#16 // np[0..1] + + mul $lo0,$hi0,$m0 // ap[0]*bp[0] + sub $j,$num,#16 // j=num-2 + umulh $hi0,$hi0,$m0 + mul $alo,$aj,$m0 // ap[1]*bp[0] + umulh $ahi,$aj,$m0 + + mul $m1,$lo0,$n0 // "tp[0]"*n0 + mov sp,$tp // alloca + + // (*) mul $lo1,$hi1,$m1 // np[0]*m1 + umulh $hi1,$hi1,$m1 + mul $nlo,$nj,$m1 // np[1]*m1 + // (*) adds $lo1,$lo1,$lo0 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // $lo0 being non-zero. So that carry can be calculated + // by adding -1 to $lo0. That's what next instruction does. 
+ subs xzr,$lo0,#1 // (*) + umulh $nhi,$nj,$m1 + adc $hi1,$hi1,xzr + cbz $j,.L1st_skip + +.L1st: + ldr $aj,[$ap],#8 + adds $lo0,$alo,$hi0 + sub $j,$j,#8 // j-- + adc $hi0,$ahi,xzr + + ldr $nj,[$np],#8 + adds $lo1,$nlo,$hi1 + mul $alo,$aj,$m0 // ap[j]*bp[0] + adc $hi1,$nhi,xzr + umulh $ahi,$aj,$m0 + + adds $lo1,$lo1,$lo0 + mul $nlo,$nj,$m1 // np[j]*m1 + adc $hi1,$hi1,xzr + umulh $nhi,$nj,$m1 + str $lo1,[$tp],#8 // tp[j-1] + cbnz $j,.L1st + +.L1st_skip: + adds $lo0,$alo,$hi0 + sub $ap,$ap,$num // rewind $ap + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + sub $np,$np,$num // rewind $np + adc $hi1,$nhi,xzr + + adds $lo1,$lo1,$lo0 + sub $i,$num,#8 // i=num-1 + adcs $hi1,$hi1,$hi0 + + adc $ovf,xzr,xzr // upmost overflow bit + stp $lo1,$hi1,[$tp] + +.Louter: + ldr $m0,[$bp],#8 // bp[i] + ldp $hi0,$aj,[$ap],#16 + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + + mul $lo0,$hi0,$m0 // ap[0]*bp[i] + sub $j,$num,#16 // j=num-2 + umulh $hi0,$hi0,$m0 + ldp $hi1,$nj,[$np],#16 + mul $alo,$aj,$m0 // ap[1]*bp[i] + adds $lo0,$lo0,$tj + umulh $ahi,$aj,$m0 + adc $hi0,$hi0,xzr + + mul $m1,$lo0,$n0 + sub $i,$i,#8 // i-- + + // (*) mul $lo1,$hi1,$m1 // np[0]*m1 + umulh $hi1,$hi1,$m1 + mul $nlo,$nj,$m1 // np[1]*m1 + // (*) adds $lo1,$lo1,$lo0 + subs xzr,$lo0,#1 // (*) + umulh $nhi,$nj,$m1 + cbz $j,.Linner_skip + +.Linner: + ldr $aj,[$ap],#8 + adc $hi1,$hi1,xzr + ldr $tj,[$tp],#8 // tp[j] + adds $lo0,$alo,$hi0 + sub $j,$j,#8 // j-- + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + ldr $nj,[$np],#8 + adc $hi1,$nhi,xzr + + mul $alo,$aj,$m0 // ap[j]*bp[i] + adds $lo0,$lo0,$tj + umulh $ahi,$aj,$m0 + adc $hi0,$hi0,xzr + + mul $nlo,$nj,$m1 // np[j]*m1 + adds $lo1,$lo1,$lo0 + umulh $nhi,$nj,$m1 + str $lo1,[$tp,#-16] // tp[j-1] + cbnz $j,.Linner + +.Linner_skip: + ldr $tj,[$tp],#8 // tp[j] + adc $hi1,$hi1,xzr + adds $lo0,$alo,$hi0 + sub $ap,$ap,$num // rewind $ap + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + sub $np,$np,$num // rewind $np + adcs $hi1,$nhi,$ovf + adc $ovf,xzr,xzr + + adds $lo0,$lo0,$tj + adc $hi0,$hi0,xzr + + adds $lo1,$lo1,$lo0 + adcs $hi1,$hi1,$hi0 + adc $ovf,$ovf,xzr // upmost overflow bit + stp $lo1,$hi1,[$tp,#-16] + + cbnz $i,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + ldr $nj,[$np],#8 // np[0] + subs $j,$num,#8 // j=num-1 and clear borrow + mov $ap,$rp +.Lsub: + sbcs $aj,$tj,$nj // tp[j]-np[j] + ldr $tj,[$tp],#8 + sub $j,$j,#8 // j-- + ldr $nj,[$np],#8 + str $aj,[$ap],#8 // rp[j]=tp[j]-np[j] + cbnz $j,.Lsub + + sbcs $aj,$tj,$nj + sbcs $ovf,$ovf,xzr // did it borrow? + str $aj,[$ap],#8 // rp[num-1] + + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + ldr $aj,[$rp],#8 // rp[0] + sub $num,$num,#8 // num-- + nop +.Lcond_copy: + sub $num,$num,#8 // num-- + csel $nj,$tj,$aj,lo // did it borrow? + ldr $tj,[$tp],#8 + ldr $aj,[$rp],#8 + str xzr,[$tp,#-16] // wipe tp + str $nj,[$rp,#-16] + cbnz $num,.Lcond_copy + + csel $nj,$tj,$aj,lo + str xzr,[$tp,#-8] // wipe tp + str $nj,[$rp,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +___ +{ +######################################################################## +# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. 
+ +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13)); +my ($t0,$t1,$t2,$t3)=map("x$_",(14..17)); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26)); +my ($cnt,$carry,$topmost)=("x27","x28","x30"); +my ($tp,$ap_end,$na0)=($bp,$np,$carry); + +$code.=<<___; +.globl bn_sqr8x_mont +.type bn_sqr8x_mont,%function +.align 5 +bn_sqr8x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $rp,$np,[sp,#96] // offload rp and np + + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + ldp $a4,$a5,[$ap,#8*4] + ldp $a6,$a7,[$ap,#8*6] + + sub $tp,sp,$num,lsl#4 + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + mov sp,$tp // alloca + sub $cnt,$num,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub $cnt,$cnt,#8*8 + stp xzr,xzr,[$tp,#8*0] + stp xzr,xzr,[$tp,#8*2] + stp xzr,xzr,[$tp,#8*4] + stp xzr,xzr,[$tp,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[$tp,#8*8] + stp xzr,xzr,[$tp,#8*10] + stp xzr,xzr,[$tp,#8*12] + stp xzr,xzr,[$tp,#8*14] + add $tp,$tp,#8*16 + cbnz $cnt,.Lsqr8x_zero + + add $ap_end,$ap,$num + add $ap,$ap,#8*8 + mov $acc0,xzr + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + mov $acc4,xzr + mov $acc5,xzr + mov $acc6,xzr + mov $acc7,xzr + mov $tp,sp + str $n0,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i) + mul $t1,$a2,$a0 + mul $t2,$a3,$a0 + mul $t3,$a4,$a0 + adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0]) + mul $t0,$a5,$a0 + adcs $acc2,$acc2,$t1 + mul $t1,$a6,$a0 + adcs $acc3,$acc3,$t2 + mul $t2,$a7,$a0 + adcs $acc4,$acc4,$t3 + umulh $t3,$a1,$a0 // hi(a[1..7]*a[0]) + adcs $acc5,$acc5,$t0 + umulh $t0,$a2,$a0 + adcs $acc6,$acc6,$t1 + umulh $t1,$a3,$a0 + adcs $acc7,$acc7,$t2 + umulh $t2,$a4,$a0 + stp $acc0,$acc1,[$tp],#8*2 // t[0..1] + adc $acc0,xzr,xzr // t[8] + adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0]) + umulh $t3,$a5,$a0 + adcs $acc3,$acc3,$t0 + umulh $t0,$a6,$a0 + adcs $acc4,$acc4,$t1 + umulh $t1,$a7,$a0 + adcs $acc5,$acc5,$t2 + mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii) + adcs $acc6,$acc6,$t3 + mul $t3,$a3,$a1 + adcs $acc7,$acc7,$t0 + mul $t0,$a4,$a1 + adc $acc0,$acc0,$t1 + + mul $t1,$a5,$a1 + adds $acc3,$acc3,$t2 + mul $t2,$a6,$a1 + adcs $acc4,$acc4,$t3 + mul $t3,$a7,$a1 + adcs $acc5,$acc5,$t0 + umulh $t0,$a2,$a1 // hi(a[2..7]*a[1]) + adcs $acc6,$acc6,$t1 + umulh $t1,$a3,$a1 + adcs $acc7,$acc7,$t2 + umulh $t2,$a4,$a1 + adcs $acc0,$acc0,$t3 + umulh $t3,$a5,$a1 + stp $acc2,$acc3,[$tp],#8*2 // t[2..3] + adc $acc1,xzr,xzr // t[9] + adds $acc4,$acc4,$t0 + umulh $t0,$a6,$a1 + adcs $acc5,$acc5,$t1 + umulh $t1,$a7,$a1 + adcs $acc6,$acc6,$t2 + mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii) + adcs $acc7,$acc7,$t3 + mul $t3,$a4,$a2 + adcs $acc0,$acc0,$t0 + mul $t0,$a5,$a2 + adc $acc1,$acc1,$t1 + + mul $t1,$a6,$a2 + adds $acc5,$acc5,$t2 + mul $t2,$a7,$a2 + adcs $acc6,$acc6,$t3 + umulh $t3,$a3,$a2 // hi(a[3..7]*a[2]) + adcs $acc7,$acc7,$t0 + umulh $t0,$a4,$a2 + adcs $acc0,$acc0,$t1 + umulh $t1,$a5,$a2 + adcs $acc1,$acc1,$t2 + umulh $t2,$a6,$a2 + 
stp $acc4,$acc5,[$tp],#8*2 // t[4..5] + adc $acc2,xzr,xzr // t[10] + adds $acc6,$acc6,$t3 + umulh $t3,$a7,$a2 + adcs $acc7,$acc7,$t0 + mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv) + adcs $acc0,$acc0,$t1 + mul $t1,$a5,$a3 + adcs $acc1,$acc1,$t2 + mul $t2,$a6,$a3 + adc $acc2,$acc2,$t3 + + mul $t3,$a7,$a3 + adds $acc7,$acc7,$t0 + umulh $t0,$a4,$a3 // hi(a[4..7]*a[3]) + adcs $acc0,$acc0,$t1 + umulh $t1,$a5,$a3 + adcs $acc1,$acc1,$t2 + umulh $t2,$a6,$a3 + adcs $acc2,$acc2,$t3 + umulh $t3,$a7,$a3 + stp $acc6,$acc7,[$tp],#8*2 // t[6..7] + adc $acc3,xzr,xzr // t[11] + adds $acc0,$acc0,$t0 + mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v) + adcs $acc1,$acc1,$t1 + mul $t1,$a6,$a4 + adcs $acc2,$acc2,$t2 + mul $t2,$a7,$a4 + adc $acc3,$acc3,$t3 + + umulh $t3,$a5,$a4 // hi(a[5..7]*a[4]) + adds $acc1,$acc1,$t0 + umulh $t0,$a6,$a4 + adcs $acc2,$acc2,$t1 + umulh $t1,$a7,$a4 + adcs $acc3,$acc3,$t2 + mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi) + adc $acc4,xzr,xzr // t[12] + adds $acc2,$acc2,$t3 + mul $t3,$a7,$a5 + adcs $acc3,$acc3,$t0 + umulh $t0,$a6,$a5 // hi(a[6..7]*a[5]) + adc $acc4,$acc4,$t1 + + umulh $t1,$a7,$a5 + adds $acc3,$acc3,$t2 + mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii) + adcs $acc4,$acc4,$t3 + umulh $t3,$a7,$a6 // hi(a[7]*a[6]) + adc $acc5,xzr,xzr // t[13] + adds $acc4,$acc4,$t0 + sub $cnt,$ap_end,$ap // done yet? + adc $acc5,$acc5,$t1 + + adds $acc5,$acc5,$t2 + sub $t0,$ap_end,$num // rewinded ap + adc $acc6,xzr,xzr // t[14] + add $acc6,$acc6,$t3 + + cbz $cnt,.Lsqr8x_outer_break + + mov $n0,$a0 + ldp $a0,$a1,[$tp,#8*0] + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + adds $acc0,$acc0,$a0 + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$ap,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$ap,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$ap,#8*4] + adcs $acc6,$acc6,$a6 + mov $rp,$ap + adcs $acc7,xzr,$a7 + ldp $a6,$a7,[$ap,#8*6] + add $ap,$ap,#8*8 + //adc $carry,xzr,xzr // moved below + mov $cnt,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +.Lsqr8x_mul: + mul $t0,$a0,$n0 + adc $carry,xzr,xzr // carry bit, modulo-scheduled + mul $t1,$a1,$n0 + add $cnt,$cnt,#8 + mul $t2,$a2,$n0 + mul $t3,$a3,$n0 + adds $acc0,$acc0,$t0 + mul $t0,$a4,$n0 + adcs $acc1,$acc1,$t1 + mul $t1,$a5,$n0 + adcs $acc2,$acc2,$t2 + mul $t2,$a6,$n0 + adcs $acc3,$acc3,$t3 + mul $t3,$a7,$n0 + adcs $acc4,$acc4,$t0 + umulh $t0,$a0,$n0 + adcs $acc5,$acc5,$t1 + umulh $t1,$a1,$n0 + adcs $acc6,$acc6,$t2 + umulh $t2,$a2,$n0 + adcs $acc7,$acc7,$t3 + umulh $t3,$a3,$n0 + adc $carry,$carry,xzr + str $acc0,[$tp],#8 + adds $acc0,$acc1,$t0 + umulh $t0,$a4,$n0 + adcs $acc1,$acc2,$t1 + umulh $t1,$a5,$n0 + adcs $acc2,$acc3,$t2 + umulh $t2,$a6,$n0 + adcs $acc3,$acc4,$t3 + umulh $t3,$a7,$n0 + ldr $n0,[$rp,$cnt] + adcs $acc4,$acc5,$t0 + adcs $acc5,$acc6,$t1 + adcs $acc6,$acc7,$t2 + adcs $acc7,$carry,$t3 + //adc $carry,xzr,xzr // moved above + cbnz $cnt,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp $ap,$ap_end // done yet? 
+ b.eq .Lsqr8x_break + + ldp $a0,$a1,[$tp,#8*0] + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + adds $acc0,$acc0,$a0 + ldr $n0,[$rp,#-8*8] + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$ap,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$ap,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$ap,#8*4] + adcs $acc6,$acc6,$a6 + mov $cnt,#-8*8 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$ap,#8*6] + add $ap,$ap,#8*8 + //adc $carry,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp $a0,$a1,[$rp,#8*0] + add $ap,$rp,#8*8 + ldp $a2,$a3,[$rp,#8*2] + sub $t0,$ap_end,$ap // is it last iteration? + ldp $a4,$a5,[$rp,#8*4] + sub $t1,$tp,$t0 + ldp $a6,$a7,[$rp,#8*6] + cbz $t0,.Lsqr8x_outer_loop + + stp $acc0,$acc1,[$tp,#8*0] + ldp $acc0,$acc1,[$t1,#8*0] + stp $acc2,$acc3,[$tp,#8*2] + ldp $acc2,$acc3,[$t1,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[$t1,#8*4] + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,$t1 + ldp $acc6,$acc7,[$t1,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0] + ldp $t1,$t2,[sp,#8*1] + ldp $a5,$a7,[$t0,#8*2] + add $ap,$t0,#8*4 + ldp $t3,$t0,[sp,#8*3] + + stp $acc0,$acc1,[$tp,#8*0] + mul $acc0,$a1,$a1 + stp $acc2,$acc3,[$tp,#8*2] + umulh $a1,$a1,$a1 + stp $acc4,$acc5,[$tp,#8*4] + mul $a2,$a3,$a3 + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,sp + umulh $a3,$a3,$a3 + adds $acc1,$a1,$t1,lsl#1 + extr $t1,$t2,$t1,#63 + sub $cnt,$num,#8*4 + +.Lsqr4x_shift_n_add: + adcs $acc2,$a2,$t1 + extr $t2,$t3,$t2,#63 + sub $cnt,$cnt,#8*4 + adcs $acc3,$a3,$t2 + ldp $t1,$t2,[$tp,#8*5] + mul $a4,$a5,$a5 + ldp $a1,$a3,[$ap],#8*2 + umulh $a5,$a5,$a5 + mul $a6,$a7,$a7 + umulh $a7,$a7,$a7 + extr $t3,$t0,$t3,#63 + stp $acc0,$acc1,[$tp,#8*0] + adcs $acc4,$a4,$t3 + extr $t0,$t1,$t0,#63 + stp $acc2,$acc3,[$tp,#8*2] + adcs $acc5,$a5,$t0 + ldp $t3,$t0,[$tp,#8*7] + extr $t1,$t2,$t1,#63 + adcs $acc6,$a6,$t1 + extr $t2,$t3,$t2,#63 + adcs $acc7,$a7,$t2 + ldp $t1,$t2,[$tp,#8*9] + mul $a0,$a1,$a1 + ldp $a5,$a7,[$ap],#8*2 + umulh $a1,$a1,$a1 + mul $a2,$a3,$a3 + umulh $a3,$a3,$a3 + stp $acc4,$acc5,[$tp,#8*4] + extr $t3,$t0,$t3,#63 + stp $acc6,$acc7,[$tp,#8*6] + add $tp,$tp,#8*8 + adcs $acc0,$a0,$t3 + extr $t0,$t1,$t0,#63 + adcs $acc1,$a1,$t0 + ldp $t3,$t0,[$tp,#8*3] + extr $t1,$t2,$t1,#63 + cbnz $cnt,.Lsqr4x_shift_n_add +___ +my ($np,$np_end)=($ap,$ap_end); +$code.=<<___; + ldp $np,$n0,[x29,#104] // pull np and n0 + + adcs $acc2,$a2,$t1 + extr $t2,$t3,$t2,#63 + adcs $acc3,$a3,$t2 + ldp $t1,$t2,[$tp,#8*5] + mul $a4,$a5,$a5 + umulh $a5,$a5,$a5 + stp $acc0,$acc1,[$tp,#8*0] + mul $a6,$a7,$a7 + umulh $a7,$a7,$a7 + stp $acc2,$acc3,[$tp,#8*2] + extr $t3,$t0,$t3,#63 + adcs $acc4,$a4,$t3 + extr $t0,$t1,$t0,#63 + ldp $acc0,$acc1,[sp,#8*0] + adcs $acc5,$a5,$t0 + extr $t1,$t2,$t1,#63 + ldp $a0,$a1,[$np,#8*0] + adcs $acc6,$a6,$t1 + extr $t2,xzr,$t2,#63 + ldp $a2,$a3,[$np,#8*2] + adc $acc7,$a7,$t2 + ldp $a4,$a5,[$np,#8*4] + + // Reduce by 512 bits per iteration + mul $na0,$n0,$acc0 // t[0]*n0 + ldp $a6,$a7,[$np,#8*6] + add $np_end,$np,$num + ldp $acc2,$acc3,[sp,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[sp,#8*4] + stp $acc6,$acc7,[$tp,#8*6] + ldp $acc6,$acc7,[sp,#8*6] + add $np,$np,#8*8 + mov $topmost,xzr // initial top-most carry + mov $tp,sp + mov $cnt,#8 + +.Lsqr8x_reduction: + // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0) + mul $t1,$a1,$na0 + sub $cnt,$cnt,#1 + mul $t2,$a2,$na0 + str $na0,[$tp],#8 // put aside t[0]*n0 
for tail processing + mul $t3,$a3,$na0 + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + mul $t0,$a4,$na0 + adcs $acc0,$acc1,$t1 + mul $t1,$a5,$na0 + adcs $acc1,$acc2,$t2 + mul $t2,$a6,$na0 + adcs $acc2,$acc3,$t3 + mul $t3,$a7,$na0 + adcs $acc3,$acc4,$t0 + umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0) + adcs $acc4,$acc5,$t1 + umulh $t1,$a1,$na0 + adcs $acc5,$acc6,$t2 + umulh $t2,$a2,$na0 + adcs $acc6,$acc7,$t3 + umulh $t3,$a3,$na0 + adc $acc7,xzr,xzr + adds $acc0,$acc0,$t0 + umulh $t0,$a4,$na0 + adcs $acc1,$acc1,$t1 + umulh $t1,$a5,$na0 + adcs $acc2,$acc2,$t2 + umulh $t2,$a6,$na0 + adcs $acc3,$acc3,$t3 + umulh $t3,$a7,$na0 + mul $na0,$n0,$acc0 // next t[0]*n0 + adcs $acc4,$acc4,$t0 + adcs $acc5,$acc5,$t1 + adcs $acc6,$acc6,$t2 + adc $acc7,$acc7,$t3 + cbnz $cnt,.Lsqr8x_reduction + + ldp $t0,$t1,[$tp,#8*0] + ldp $t2,$t3,[$tp,#8*2] + mov $rp,$tp + sub $cnt,$np_end,$np // done yet? + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + ldp $t0,$t1,[$tp,#8*4] + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + ldp $t2,$t3,[$tp,#8*6] + adcs $acc4,$acc4,$t0 + adcs $acc5,$acc5,$t1 + adcs $acc6,$acc6,$t2 + adcs $acc7,$acc7,$t3 + //adc $carry,xzr,xzr // moved below + cbz $cnt,.Lsqr8x8_post_condition + + ldr $n0,[$tp,#-8*8] + ldp $a0,$a1,[$np,#8*0] + ldp $a2,$a3,[$np,#8*2] + ldp $a4,$a5,[$np,#8*4] + mov $cnt,#-8*8 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + +.Lsqr8x_tail: + mul $t0,$a0,$n0 + adc $carry,xzr,xzr // carry bit, modulo-scheduled + mul $t1,$a1,$n0 + add $cnt,$cnt,#8 + mul $t2,$a2,$n0 + mul $t3,$a3,$n0 + adds $acc0,$acc0,$t0 + mul $t0,$a4,$n0 + adcs $acc1,$acc1,$t1 + mul $t1,$a5,$n0 + adcs $acc2,$acc2,$t2 + mul $t2,$a6,$n0 + adcs $acc3,$acc3,$t3 + mul $t3,$a7,$n0 + adcs $acc4,$acc4,$t0 + umulh $t0,$a0,$n0 + adcs $acc5,$acc5,$t1 + umulh $t1,$a1,$n0 + adcs $acc6,$acc6,$t2 + umulh $t2,$a2,$n0 + adcs $acc7,$acc7,$t3 + umulh $t3,$a3,$n0 + adc $carry,$carry,xzr + str $acc0,[$tp],#8 + adds $acc0,$acc1,$t0 + umulh $t0,$a4,$n0 + adcs $acc1,$acc2,$t1 + umulh $t1,$a5,$n0 + adcs $acc2,$acc3,$t2 + umulh $t2,$a6,$n0 + adcs $acc3,$acc4,$t3 + umulh $t3,$a7,$n0 + ldr $n0,[$rp,$cnt] + adcs $acc4,$acc5,$t0 + adcs $acc5,$acc6,$t1 + adcs $acc6,$acc7,$t2 + adcs $acc7,$carry,$t3 + //adc $carry,xzr,xzr // moved above + cbnz $cnt,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp $a0,$a1,[$tp,#8*0] + sub $cnt,$np_end,$np // done yet? 
+ sub $t2,$np_end,$num // rewinded np + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + cbz $cnt,.Lsqr8x_tail_break + + ldr $n0,[$rp,#-8*8] + adds $acc0,$acc0,$a0 + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$np,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$np,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$np,#8*4] + adcs $acc6,$acc6,$a6 + mov $cnt,#-8*8 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + //adc $carry,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr $n0,[x29,#112] // pull n0 + add $cnt,$tp,#8*8 // end of current t[num] window + + subs xzr,$topmost,#1 // "move" top-most carry to carry bit + adcs $t0,$acc0,$a0 + adcs $t1,$acc1,$a1 + ldp $acc0,$acc1,[$rp,#8*0] + adcs $acc2,$acc2,$a2 + ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0] + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$t2,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$t2,#8*4] + adcs $acc6,$acc6,$a6 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$t2,#8*6] + add $np,$t2,#8*8 + adc $topmost,xzr,xzr // top-most carry + mul $na0,$n0,$acc0 + stp $t0,$t1,[$tp,#8*0] + stp $acc2,$acc3,[$tp,#8*2] + ldp $acc2,$acc3,[$rp,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[$rp,#8*4] + cmp $cnt,x29 // did we hit the bottom? + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,$rp // slide the window + ldp $acc6,$acc7,[$rp,#8*6] + mov $cnt,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr $rp,[x29,#96] // pull rp + add $tp,$tp,#8*8 + subs $t0,$acc0,$a0 + sbcs $t1,$acc1,$a1 + sub $cnt,$num,#8*8 + mov $ap_end,$rp // $rp copy + +.Lsqr8x_sub: + sbcs $t2,$acc2,$a2 + ldp $a0,$a1,[$np,#8*0] + sbcs $t3,$acc3,$a3 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc4,$a4 + ldp $a2,$a3,[$np,#8*2] + sbcs $t1,$acc5,$a5 + stp $t2,$t3,[$rp,#8*2] + sbcs $t2,$acc6,$a6 + ldp $a4,$a5,[$np,#8*4] + sbcs $t3,$acc7,$a7 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + ldp $acc0,$acc1,[$tp,#8*0] + sub $cnt,$cnt,#8*8 + ldp $acc2,$acc3,[$tp,#8*2] + ldp $acc4,$acc5,[$tp,#8*4] + ldp $acc6,$acc7,[$tp,#8*6] + add $tp,$tp,#8*8 + stp $t0,$t1,[$rp,#8*4] + sbcs $t0,$acc0,$a0 + stp $t2,$t3,[$rp,#8*6] + add $rp,$rp,#8*8 + sbcs $t1,$acc1,$a1 + cbnz $cnt,.Lsqr8x_sub + + sbcs $t2,$acc2,$a2 + mov $tp,sp + add $ap,sp,$num + ldp $a0,$a1,[$ap_end,#8*0] + sbcs $t3,$acc3,$a3 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc4,$a4 + ldp $a2,$a3,[$ap_end,#8*2] + sbcs $t1,$acc5,$a5 + stp $t2,$t3,[$rp,#8*2] + sbcs $t2,$acc6,$a6 + ldp $acc0,$acc1,[$ap,#8*0] + sbcs $t3,$acc7,$a7 + ldp $acc2,$acc3,[$ap,#8*2] + sbcs xzr,$topmost,xzr // did it borrow? 
+ ldr x30,[x29,#8] // pull return address + stp $t0,$t1,[$rp,#8*4] + stp $t2,$t3,[$rp,#8*6] + + sub $cnt,$num,#8*4 +.Lsqr4x_cond_copy: + sub $cnt,$cnt,#8*4 + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + ldp $a0,$a1,[$ap_end,#8*4] + ldp $acc0,$acc1,[$ap,#8*4] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*2] + add $tp,$tp,#8*4 + csel $t3,$acc3,$a3,lo + ldp $a2,$a3,[$ap_end,#8*6] + ldp $acc2,$acc3,[$ap,#8*6] + add $ap,$ap,#8*4 + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + add $ap_end,$ap_end,#8*4 + stp xzr,xzr,[$ap,#8*0] + stp xzr,xzr,[$ap,#8*2] + cbnz $cnt,.Lsqr4x_cond_copy + + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + stp xzr,xzr,[$tp,#8*2] + csel $t2,$acc2,$a2,lo + csel $t3,$acc3,$a3,lo + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc $carry,xzr,xzr + ldr x30,[x29,#8] // pull return address + // $acc0-7,$carry hold result, $a0-7 hold modulus + subs $a0,$acc0,$a0 + ldr $ap,[x29,#96] // pull rp + sbcs $a1,$acc1,$a1 + stp xzr,xzr,[sp,#8*0] + sbcs $a2,$acc2,$a2 + stp xzr,xzr,[sp,#8*2] + sbcs $a3,$acc3,$a3 + stp xzr,xzr,[sp,#8*4] + sbcs $a4,$acc4,$a4 + stp xzr,xzr,[sp,#8*6] + sbcs $a5,$acc5,$a5 + stp xzr,xzr,[sp,#8*8] + sbcs $a6,$acc6,$a6 + stp xzr,xzr,[sp,#8*10] + sbcs $a7,$acc7,$a7 + stp xzr,xzr,[sp,#8*12] + sbcs $carry,$carry,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // $a0-7 hold result-modulus + csel $a0,$acc0,$a0,lo + csel $a1,$acc1,$a1,lo + csel $a2,$acc2,$a2,lo + csel $a3,$acc3,$a3,lo + stp $a0,$a1,[$ap,#8*0] + csel $a4,$acc4,$a4,lo + csel $a5,$acc5,$a5,lo + stp $a2,$a3,[$ap,#8*2] + csel $a6,$acc6,$a6,lo + csel $a7,$acc7,$a7,lo + stp $a4,$a5,[$ap,#8*4] + stp $a6,$a7,[$ap,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_sqr8x_mont,.-bn_sqr8x_mont +___ +} + +{ +######################################################################## +# Even though this might look as ARMv8 adaptation of mulx4x_mont from +# x86_64-mont5 module, it's different in sense that it performs +# reduction 256 bits at a time. + +my ($a0,$a1,$a2,$a3, + $t0,$t1,$t2,$t3, + $m0,$m1,$m2,$m3, + $acc0,$acc1,$acc2,$acc3,$acc4, + $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); +my $bp_end=$rp; +my ($carry,$topmost) = ($rp,"x30"); + +$code.=<<___; +.globl bn_mul4x_mont +.type bn_mul4x_mont,%function +.align 5 +bn_mul4x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub $tp,sp,$num,lsl#3 + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + sub sp,$tp,#8*4 // alloca + + add $t0,$bp,$num + add $ap_end,$ap,$num + stp $rp,$t0,[x29,#96] // offload rp and &b[num] + + ldr $bi,[$bp,#8*0] // b[0] + ldp $a0,$a1,[$ap,#8*0] // a[0..3] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + mov $acc0,xzr + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + ldp $m0,$m1,[$np,#8*0] // n[0..3] + ldp $m2,$m3,[$np,#8*2] + adds $np,$np,#8*4 // clear carry bit + mov $carry,xzr + mov $cnt,#0 + mov $tp,sp + +.Loop_mul4x_1st_reduction: + mul $t0,$a0,$bi // lo(a[0..3]*b[0]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[0..3]*b[0]) + adcs $acc1,$acc1,$t1 + mul $mi,$acc0,$n0 // t[0]*n0 + adcs $acc2,$acc2,$t2 + umulh $t1,$a1,$bi + adcs $acc3,$acc3,$t3 + umulh $t2,$a2,$bi + adc $acc4,xzr,xzr + umulh $t3,$a3,$bi + ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) + adds $acc1,$acc1,$t0 + // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0) + str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0) + adcs $acc0,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc1,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc2,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc3,$acc4,$carry + adc $carry,xzr,xzr + adds $acc0,$acc0,$t0 + sub $t0,$ap_end,$ap + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_1st_reduction + + cbz $t0,.Lmul4x4_post_condition + + ldp $a0,$a1,[$ap,#8*0] // a[4..7] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + ldr $mi,[sp] // a[0]*n0 + ldp $m0,$m1,[$np,#8*0] // n[4..7] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + +.Loop_mul4x_1st_tail: + mul $t0,$a0,$bi // lo(a[4..7]*b[i]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[4..7]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,xzr,xzr + ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) + adds $acc1,$acc1,$t0 + mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0) + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + adds $acc0,$acc0,$t0 + umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0) + adcs $acc1,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc2,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc3,$acc3,$t3 + adcs $acc4,$acc4,$carry + umulh $t3,$m3,$mi + adc $carry,xzr,xzr + ldr $mi,[sp,$cnt] // next t[0]*n0 + str $acc0,[$tp],#8 // result!!! + adds $acc0,$acc1,$t0 + sub $t0,$ap_end,$ap // done yet? + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 + adcs $acc3,$acc4,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_1st_tail + + sub $t1,$ap_end,$num // rewinded $ap + cbz $t0,.Lmul4x_proceed + + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + ldp $m0,$m1,[$np,#8*0] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr $bi,[$bp,#8*4]! 
// *++b + adc $topmost,$carry,xzr + ldp $a0,$a1,[$t1,#8*0] // a[0..3] + sub $np,$np,$num // rewind np + ldp $a2,$a3,[$t1,#8*2] + add $ap,$t1,#8*4 + + stp $acc0,$acc1,[$tp,#8*0] // result!!! + ldp $acc0,$acc1,[sp,#8*4] // t[0..3] + stp $acc2,$acc3,[$tp,#8*2] // result!!! + ldp $acc2,$acc3,[sp,#8*6] + + ldp $m0,$m1,[$np,#8*0] // n[0..3] + mov $tp,sp + ldp $m2,$m3,[$np,#8*2] + adds $np,$np,#8*4 // clear carry bit + mov $carry,xzr + +.align 4 +.Loop_mul4x_reduction: + mul $t0,$a0,$bi // lo(a[0..3]*b[4]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[0..3]*b[4]) + adcs $acc1,$acc1,$t1 + mul $mi,$acc0,$n0 // t[0]*n0 + adcs $acc2,$acc2,$t2 + umulh $t1,$a1,$bi + adcs $acc3,$acc3,$t3 + umulh $t2,$a2,$bi + adc $acc4,xzr,xzr + umulh $t3,$a3,$bi + ldr $bi,[$bp,$cnt] // next b[i] + adds $acc1,$acc1,$t0 + // (*) mul $t0,$m0,$mi + str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0 + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0 + adcs $acc0,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc1,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc2,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc3,$acc4,$carry + adc $carry,xzr,xzr + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_reduction + + adc $carry,$carry,xzr + ldp $t0,$t1,[$tp,#8*4] // t[4..7] + ldp $t2,$t3,[$tp,#8*6] + ldp $a0,$a1,[$ap,#8*0] // a[4..7] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + + ldr $mi,[sp] // t[0]*n0 + ldp $m0,$m1,[$np,#8*0] // n[4..7] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + +.align 4 +.Loop_mul4x_tail: + mul $t0,$a0,$bi // lo(a[4..7]*b[4]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[4..7]*b[4]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,xzr,xzr + ldr $bi,[$bp,$cnt] // next b[i] + adds $acc1,$acc1,$t0 + mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0) + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + adds $acc0,$acc0,$t0 + umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0) + adcs $acc1,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc2,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc3,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc4,$acc4,$carry + ldr $mi,[sp,$cnt] // next a[0]*n0 + adc $carry,xzr,xzr + str $acc0,[$tp],#8 // result!!! + adds $acc0,$acc1,$t0 + sub $t0,$ap_end,$ap // done yet? + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 + adcs $acc3,$acc4,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_tail + + sub $t1,$np,$num // rewinded np? 
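
The lines marked (*) above record a small optimization: lo(n[0] * m), where m = t[0]*n0 mod 2^64, never has to be computed. With n0 = -n[0]^-1 mod 2^64, m is chosen precisely so that t[0] + lo(m*n[0]) wraps to zero, so the only thing that addition contributes is its carry, which is 1 exactly when t[0] is non-zero; `subs xzr,$acc0,#1` leaves the same value in the carry flag. A short C illustration of the identity, with illustrative names:

#include <assert.h>
#include <stdint.h>

/* Carry that the discarded addition t0 + lo(m*n_lo) would have produced.
 * Because m = t0*n0 mod 2^64 with n0 = -n_lo^-1 mod 2^64, the low 64 bits
 * of the sum are zero, so the carry is simply t0 != 0. */
static unsigned discarded_carry(uint64_t t0, uint64_t m, uint64_t n_lo) {
    uint64_t lo = m * n_lo;   /* low 64 bits of m * n[0] */
    uint64_t sum = t0 + lo;   /* wraps to 0 by the choice of m */
    assert(sum == 0);
    (void)sum;
    return t0 != 0;           /* equals the carry out of t0 + lo */
}
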
+ adc $carry,$carry,xzr + cbz $t0,.Loop_mul4x_break + + ldp $t0,$t1,[$tp,#8*4] + ldp $t2,$t3,[$tp,#8*6] + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + ldp $m0,$m1,[$np,#8*0] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + b .Loop_mul4x_tail + +.align 4 +.Loop_mul4x_break: + ldp $t2,$t3,[x29,#96] // pull rp and &b[num] + adds $acc0,$acc0,$topmost + add $bp,$bp,#8*4 // bp++ + adcs $acc1,$acc1,xzr + sub $ap,$ap,$num // rewind ap + adcs $acc2,$acc2,xzr + stp $acc0,$acc1,[$tp,#8*0] // result!!! + adcs $acc3,$acc3,xzr + ldp $acc0,$acc1,[sp,#8*4] // t[0..3] + adc $topmost,$carry,xzr + stp $acc2,$acc3,[$tp,#8*2] // result!!! + cmp $bp,$t3 // done yet? + ldp $acc2,$acc3,[sp,#8*6] + ldp $m0,$m1,[$t1,#8*0] // n[0..3] + ldp $m2,$m3,[$t1,#8*2] + add $np,$t1,#8*4 + b.eq .Lmul4x_post + + ldr $bi,[$bp] + ldp $a0,$a1,[$ap,#8*0] // a[0..3] + ldp $a2,$a3,[$ap,#8*2] + adds $ap,$ap,#8*4 // clear carry bit + mov $carry,xzr + mov $tp,sp + b .Loop_mul4x_reduction + +.align 4 +.Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov $rp,$t2 + mov $ap_end,$t2 // $rp copy + subs $t0,$acc0,$m0 + add $tp,sp,#8*8 + sbcs $t1,$acc1,$m1 + sub $cnt,$num,#8*4 + +.Lmul4x_sub: + sbcs $t2,$acc2,$m2 + ldp $m0,$m1,[$np,#8*0] + sub $cnt,$cnt,#8*4 + ldp $acc0,$acc1,[$tp,#8*0] + sbcs $t3,$acc3,$m3 + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + ldp $acc2,$acc3,[$tp,#8*2] + add $tp,$tp,#8*4 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc0,$m0 + stp $t2,$t3,[$rp,#8*2] + add $rp,$rp,#8*4 + sbcs $t1,$acc1,$m1 + cbnz $cnt,.Lmul4x_sub + + sbcs $t2,$acc2,$m2 + mov $tp,sp + add $ap,sp,#8*4 + ldp $a0,$a1,[$ap_end,#8*0] + sbcs $t3,$acc3,$m3 + stp $t0,$t1,[$rp,#8*0] + ldp $a2,$a3,[$ap_end,#8*2] + stp $t2,$t3,[$rp,#8*2] + ldp $acc0,$acc1,[$ap,#8*0] + ldp $acc2,$acc3,[$ap,#8*2] + sbcs xzr,$topmost,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub $cnt,$num,#8*4 +.Lmul4x_cond_copy: + sub $cnt,$cnt,#8*4 + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + ldp $a0,$a1,[$ap_end,#8*4] + ldp $acc0,$acc1,[$ap,#8*4] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*2] + add $tp,$tp,#8*4 + csel $t3,$acc3,$a3,lo + ldp $a2,$a3,[$ap_end,#8*6] + ldp $acc2,$acc3,[$ap,#8*6] + add $ap,$ap,#8*4 + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + add $ap_end,$ap_end,#8*4 + cbnz $cnt,.Lmul4x_cond_copy + + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + stp xzr,xzr,[$tp,#8*2] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*3] + csel $t3,$acc3,$a3,lo + stp xzr,xzr,[$tp,#8*4] + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + adc $carry,$carry,xzr + ldr $ap,[x29,#96] // pull rp + // $acc0-3,$carry hold result, $m0-7 hold modulus + subs $a0,$acc0,$m0 + ldr x30,[x29,#8] // pull return address + sbcs $a1,$acc1,$m1 + stp xzr,xzr,[sp,#8*0] + sbcs $a2,$acc2,$m2 + stp xzr,xzr,[sp,#8*2] + sbcs $a3,$acc3,$m3 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,$carry,xzr // did it borrow? 
+ stp xzr,xzr,[sp,#8*6] + + // $a0-3 hold result-modulus + csel $a0,$acc0,$a0,lo + csel $a1,$acc1,$a1,lo + csel $a2,$acc2,$a2,lo + csel $a3,$acc3,$a3,lo + stp $a0,$a1,[$ap,#8*0] + stp $a2,$a3,[$ap,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul4x_mont,.-bn_mul4x_mont +___ +} +$code.=<<___; +.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by " +.align 4 +___ + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/x86-mont.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86-mont.pl new file mode 100644 index 0000000000..afa09948bc --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86-mont.pl @@ -0,0 +1,334 @@ +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# October 2005 +# +# This is a "teaser" code, as it can be improved in several ways... +# First of all non-SSE2 path should be implemented (yes, for now it +# performs Montgomery multiplication/convolution only on SSE2-capable +# CPUs such as P4, others fall down to original code). Then inner loop +# can be unrolled and modulo-scheduled to improve ILP and possibly +# moved to 128-bit XMM register bank (though it would require input +# rearrangement and/or increase bus bandwidth utilization). Dedicated +# squaring procedure should give further performance improvement... +# Yet, for being draft, the code improves rsa512 *sign* benchmark by +# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) + +# December 2006 +# +# Modulo-scheduling SSE2 loops results in further 15-20% improvement. +# Integer-only code [being equipped with dedicated squaring procedure] +# gives ~40% on rsa512 sign benchmark... + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../../perlasm"); +require "x86asm.pl"; + +$output = pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); + +$sse2=1; + +&function_begin("bn_mul_mont"); + +$i="edx"; +$j="ecx"; +$ap="esi"; $tp="esi"; # overlapping variables!!! +$rp="edi"; $bp="edi"; # overlapping variables!!! 
+$np="ebp"; +$num="ebx"; + +$_num=&DWP(4*0,"esp"); # stack top layout +$_rp=&DWP(4*1,"esp"); +$_ap=&DWP(4*2,"esp"); +$_bp=&DWP(4*3,"esp"); +$_np=&DWP(4*4,"esp"); +$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); +$_sp=&DWP(4*6,"esp"); +$_bpend=&DWP(4*7,"esp"); +$frame=32; # size of above frame rounded up to 16n + + &xor ("eax","eax"); + &mov ("edi",&wparam(5)); # int num + + &lea ("esi",&wparam(0)); # put aside pointer to argument block + &lea ("edx",&wparam(1)); # load ap + &add ("edi",2); # extra two words on top of tp + &neg ("edi"); + &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2)) + &neg ("edi"); + + # minimize cache contention by arranging 2K window between stack + # pointer and ap argument [np is also position sensitive vector, + # but it's assumed to be near ap, as it's allocated at ~same + # time]. + &mov ("eax","ebp"); + &sub ("eax","edx"); + &and ("eax",2047); + &sub ("ebp","eax"); # this aligns sp and ap modulo 2048 + + &xor ("edx","ebp"); + &and ("edx",2048); + &xor ("edx",2048); + &sub ("ebp","edx"); # this splits them apart modulo 4096 + + &and ("ebp",-64); # align to cache line + + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... + &mov ("eax","esp"); + &sub ("eax","ebp"); + &and ("eax",-4096); + &mov ("edx","esp"); # saved stack pointer! + &lea ("esp",&DWP(0,"ebp","eax")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); + &jmp (&label("page_walk_done")); + +&set_label("page_walk",16); + &lea ("esp",&DWP(-4096,"esp")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); +&set_label("page_walk_done"); + + ################################# load argument block... + &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp + &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap + &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp + &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 + #&mov ("edi",&DWP(5*4,"esi"));# int num + + &mov ("esi",&DWP(0,"esi")); # pull n0[0] + &mov ($_rp,"eax"); # ... save a copy of argument block + &mov ($_ap,"ebx"); + &mov ($_bp,"ecx"); + &mov ($_np,"ebp"); + &mov ($_n0,"esi"); + &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling + #&mov ($_num,$num); # redundant as $num is not reused + &mov ($_sp,"edx"); # saved stack pointer! 
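
The loop that follows (like the AArch64 and x86_64 variants alongside it) is word-by-word Montgomery multiplication: for each word of b, accumulate a*b[i] into the running value t, then add the multiple m*n that zeroes the low word of t so it can be shifted out, and finish with one conditional subtraction. A compact reference sketch in C with 32-bit limbs to match this file's SSE2 path; the function name and the flat scratch buffer are illustrative, not this routine's interface:

#include <stddef.h>
#include <stdint.h>

/* rp = ap * bp * 2^(-32*num) mod np.  tp is num+2 limbs of scratch,
 * zero-initialized by the caller.  n0 = -np[0]^-1 mod 2^32. */
static void mont_mul_ref(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                         const uint32_t *np, uint32_t n0, size_t num,
                         uint32_t *tp) {
    for (size_t i = 0; i < num; i++) {
        /* t += a * b[i] */
        uint64_t carry = 0;
        for (size_t j = 0; j < num; j++) {
            uint64_t v = (uint64_t)ap[j] * bp[i] + tp[j] + carry;
            tp[j] = (uint32_t)v;
            carry = v >> 32;
        }
        uint64_t v = (uint64_t)tp[num] + carry;
        tp[num] = (uint32_t)v;
        tp[num + 1] = (uint32_t)(v >> 32);

        /* m zeroes the low limb of t + m*n, so t can shift right one limb */
        uint32_t m = (uint32_t)((uint64_t)tp[0] * n0);
        carry = ((uint64_t)m * np[0] + tp[0]) >> 32;    /* low word is 0 */
        for (size_t j = 1; j < num; j++) {
            uint64_t w = (uint64_t)m * np[j] + tp[j] + carry;
            tp[j - 1] = (uint32_t)w;                    /* shift as we go */
            carry = w >> 32;
        }
        v = (uint64_t)tp[num] + carry;
        tp[num - 1] = (uint32_t)v;
        tp[num] = tp[num + 1] + (uint32_t)(v >> 32);
    }
    /* final conditional subtraction, as in the earlier sketch */
    uint32_t borrow = 0;
    for (size_t j = 0; j < num; j++) {
        uint64_t d = (uint64_t)tp[j] - np[j] - borrow;
        rp[j] = (uint32_t)d;
        borrow = (uint32_t)(d >> 63);                   /* 1 iff it borrowed */
    }
    uint32_t keep = 0 - (uint32_t)(tp[num] >= borrow);  /* all-ones or zero */
    for (size_t j = 0; j < num; j++) {
        rp[j] = (rp[j] & keep) | (tp[j] & ~keep);
    }
}
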
+ +if($sse2) { +$acc0="mm0"; # mmx register bank layout +$acc1="mm1"; +$car0="mm2"; +$car1="mm3"; +$mul0="mm4"; +$mul1="mm5"; +$temp="mm6"; +$mask="mm7"; + + &mov ("eax",-1); + &movd ($mask,"eax"); # mask 32 lower bits + + &mov ($ap,$_ap); # load input pointers + &mov ($bp,$_bp); + &mov ($np,$_np); + + &xor ($i,$i); # i=0 + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp)); # bp[0] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($car1,&DWP(0,$np)); # np[0] + + &pmuludq($mul1,$mul0); # ap[0]*bp[0] + &movq ($car0,$mul1); + &movq ($acc0,$mul1); # I wish movd worked for + &pand ($acc0,$mask); # inter-register transfers + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 + &paddq ($car1,$acc0); + + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &inc ($j); # j++ +&set_label("1st",16); + &pmuludq($acc0,$mul0); # ap[j]*bp[0] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[0]; + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= + &psrlq ($car1,32); + + &lea ($j,&DWP(1,$j)); + &cmp ($j,$num); + &jl (&label("1st")); + + &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &paddq ($car1,$car0); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &inc ($i); # i++ +&set_label("outer"); + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($temp,&DWP($frame,"esp")); # tp[0] + &movd ($car1,&DWP(0,$np)); # np[0] + &pmuludq($mul1,$mul0); # ap[0]*bp[i] + + &paddq ($mul1,$temp); # +=tp[0] + &movq ($acc0,$mul1); + &movq ($car0,$mul1); + &pand ($acc0,$mask); + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); + &paddq ($car1,$acc0); + + &movd ($temp,&DWP($frame+4,"esp")); # tp[1] + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[1] + + &inc ($j); # j++ + &dec ($num); +&set_label("inner"); + &pmuludq($acc0,$mul0); # ap[j]*bp[i] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[j+1] + + &dec ($num); + &lea ($j,&DWP(1,$j)); # j++ + &jnz (&label("inner")); + + &mov ($num,$j); + &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + &psrlq ($car0,32); + &psrlq ($car1,32); + + &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] + &paddq ($car1,$car0); + &paddq 
($car1,$temp); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &lea ($i,&DWP(1,$i)); # i++ + &cmp ($i,$num); + &jle (&label("outer")); + + &emms (); # done with mmx bank + &jmp (&label("common_tail")); +} + +&set_label("common_tail",16); + &mov ($np,$_np); # load modulus pointer + &mov ($rp,$_rp); # load result pointer + &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] + + &mov ("eax",&DWP(0,$tp)); # tp[0] + &mov ($j,$num); # j=num-1 + &xor ($i,$i); # i=0 and clear CF! + +&set_label("sub",16); + &sbb ("eax",&DWP(0,$np,$i,4)); + &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] + &dec ($j); # doesn't affect CF! + &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] + &lea ($i,&DWP(1,$i)); # i++ + &jge (&label("sub")); + + &sbb ("eax",0); # handle upmost overflow bit + &mov ("edx",-1); + &xor ("edx","eax"); + &jmp (&label("copy")); + +&set_label("copy",16); # conditional copy + &mov ($tp,&DWP($frame,"esp",$num,4)); + &mov ($np,&DWP(0,$rp,$num,4)); + &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector + &and ($tp,"eax"); + &and ($np,"edx"); + &or ($np,$tp); + &mov (&DWP(0,$rp,$num,4),$np); + &dec ($num); + &jge (&label("copy")); + + &mov ("esp",$_sp); # pull saved stack pointer + &mov ("eax",1); +&function_end("bn_mul_mont"); + +&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by "); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont.pl new file mode 100644 index 0000000000..acbcd31099 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont.pl @@ -0,0 +1,1561 @@ +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# October 2005. +# +# Montgomery multiplication routine for x86_64. While it gives modest +# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more +# than twice, >2x, as fast. Most common rsa1024 sign is improved by +# respectful 50%. It remains to be seen if loop unrolling and +# dedicated squaring routine can provide further improvement... + +# July 2011. +# +# Add dedicated squaring procedure. Performance improvement varies +# from platform to platform, but in average it's ~5%/15%/25%/33% +# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. + +# August 2011. +# +# Unroll and modulo-schedule inner loops in such manner that they +# are "fallen through" for input lengths of 8, which is critical for +# 1024-bit RSA *sign*. Average performance improvement in comparison +# to *initial* version of this module from 2005 is ~0%/30%/40%/45% +# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. + +# June 2013. 
+# +# Optimize reduction in squaring procedure and improve 1024+-bit RSA +# sign performance by 10-16% on Intel Sandy Bridge and later +# (virtually same on non-Intel processors). + +# August 2013. +# +# Add MULX/ADOX/ADCX code path. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +$addx = 1; + +# void bn_mul_mont_nohw( +$rp="%rdi"; # BN_ULONG *rp, +$ap="%rsi"; # const BN_ULONG *ap, +$bp="%rdx"; # const BN_ULONG *bp, +$np="%rcx"; # const BN_ULONG *np, +$n0="%r8"; # const BN_ULONG *n0, +# TODO(davidben): The code below treats $num as an int, but C passes in a +# size_t. +$num="%r9"; # size_t num); +$lo0="%r10"; +$hi0="%r11"; +$hi1="%r13"; +$i="%r14"; +$j="%r15"; +$m0="%rbx"; +$m1="%rbp"; + +$code=<<___; +.text + +.globl bn_mul_mont_nohw +.type bn_mul_mont_nohw,\@function,6 +.align 16 +bn_mul_mont_nohw: +.cfi_startproc + _CET_ENDBR + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + + neg $num + mov %rsp,%r11 + lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage + + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... 
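
The comment above (the 32-bit file carries the same one) explains the page-walk loops that follow: before the stack pointer jumps down by more than one page, every intervening 4 KiB page is touched from the top down, so an OS guard page is always the next page to fault in. The access pattern, shown here against an ordinary buffer purely for illustration:

#include <stddef.h>

/* Touch one byte in every 4 KiB page of buf[0..len), highest page first,
 * mimicking the .Lmul_page_walk loops. */
static void page_walk(volatile unsigned char *buf, size_t len) {
    if (len == 0)
        return;
    size_t off = (len - 1) & ~(size_t)4095;   /* start of the last page */
    for (;;) {
        (void)buf[off];                       /* read and discard */
        if (off == 0)
            break;
        off -= 4096;
    }
}
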
+ sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 +.Lmul_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 +.Lmul_body: + mov $bp,%r12 # reassign $bp +___ + $bp="%r12"; +$code.=<<___; + mov ($n0),$n0 # pull n0[0] value + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$lo0 + mov ($np),%rax + + imulq $lo0,$m1 # "tp[0]"*n0 + mov %rdx,$hi0 + + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$hi1 + + lea 1($j),$j # j++ + jmp .L1st_enter + +.align 16 +.L1st: + add %rax,$hi1 + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + mov $lo0,$hi0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.L1st_enter: + mulq $m0 # ap[j]*bp[0] + add %rax,$hi0 + mov ($np,$j,8),%rax + adc \$0,%rdx + lea 1($j),$j # j++ + mov %rdx,$lo0 + + mulq $m1 # np[j]*m1 + cmp $num,$j + jne .L1st + + add %rax,$hi1 + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + mov $lo0,$hi0 + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + jmp .Louter +.align 16 +.Louter: + mov ($bp,$i,8),$m0 # m0=bp[i] + xor $j,$j # j=0 + mov $n0,$m1 + mov (%rsp),$lo0 + mulq $m0 # ap[0]*bp[i] + add %rax,$lo0 # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + imulq $lo0,$m1 # tp[0]*n0 + mov %rdx,$hi0 + + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov 8(%rsp),$lo0 # tp[1] + mov %rdx,$hi1 + + lea 1($j),$j # j++ + jmp .Linner_enter + +.align 16 +.Linner: + add %rax,$hi1 + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.Linner_enter: + mulq $m0 # ap[j]*bp[i] + add %rax,$hi0 + mov ($np,$j,8),%rax + adc \$0,%rdx + add $hi0,$lo0 # ap[j]*bp[i]+tp[j] + mov %rdx,$hi0 + adc \$0,$hi0 + lea 1($j),$j # j++ + + mulq $m1 # np[j]*m1 + cmp $num,$j + jne .Linner + + add %rax,$hi1 + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + add $lo0,$hi1 # pull upmost overflow bit + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + cmp $num,$i + jb .Louter + + xor $i,$i # i=0 and clear CF! + mov (%rsp),%rax # tp[0] + mov $num,$j # j=num + +.align 16 +.Lsub: sbb ($np,$i,8),%rax + mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 8(%rsp,$i,8),%rax # tp[i+1] + lea 1($i),$i # i++ + dec $j # doesn't affect CF! 
+ jnz .Lsub + + sbb \$0,%rax # handle upmost overflow bit + mov \$-1,%rbx + xor %rax,%rbx # not %rax + xor $i,$i + mov $num,$j # j=num + +.Lcopy: # conditional copy + mov ($rp,$i,8),%rcx + mov (%rsp,$i,8),%rdx + and %rbx,%rcx + and %rax,%rdx + mov $num,(%rsp,$i,8) # zap temporary vector + or %rcx,%rdx + mov %rdx,($rp,$i,8) # rp[i]=tp[i] + lea 1($i),$i + sub \$1,$j + jnz .Lcopy + + mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul_epilogue: + ret +.cfi_endproc +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +___ +{{{ +my @A=("%r10","%r11"); +my @N=("%r13","%rdi"); +$code.=<<___; +.globl bn_mul4x_mont +.type bn_mul4x_mont,\@function,6 +.align 16 +bn_mul4x_mont: +.cfi_startproc + _CET_ENDBR + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + + neg $num + mov %rsp,%r11 + lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) + neg $num # restore + and \$-1024,%r10 # minimize TLB usage + + sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 +.Lmul4x_body: + mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp + mov %rdx,%r12 # reassign $bp +___ + $bp="%r12"; +$code.=<<___; + mov ($n0),$n0 # pull n0[0] value + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$A[0] + mov ($np),%rax + + imulq $A[0],$m1 # "tp[0]"*n0 + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 4($j),$j # j++ + adc \$0,%rdx + mov $N[1],(%rsp) + mov %rdx,$N[0] + jmp .L1st4x +.align 16 +.L1st4x: + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc 
\$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jb .L1st4x + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + lea 1($i),$i # i++ +.align 4 +.Louter4x: + mov ($bp,$i,8),$m0 # m0=bp[i] + xor $j,$j # j=0 + mov (%rsp),$A[0] + mov $n0,$m1 + mulq $m0 # ap[0]*bp[i] + add %rax,$A[0] # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + imulq $A[0],$m1 # tp[0]*n0 + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # "$N[0]", discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + add 8(%rsp),$A[1] # +tp[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] + lea 4($j),$j # j+=2 + adc \$0,%rdx + mov $N[1],(%rsp) # tp[j-1] + mov %rdx,$N[0] + jmp .Linner4x +.align 16 +.Linner4x: + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + add 8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jb .Linner4x + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 1($i),$i # i++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov 
$N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + add (%rsp,$num,8),$N[0] # pull upmost overflow bit + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + cmp $num,$i + jb .Louter4x +___ +{ +my @ri=("%rax","%rdx",$m0,$m1); +$code.=<<___; + mov 16(%rsp,$num,8),$rp # restore $rp + lea -4($num),$j + mov 0(%rsp),@ri[0] # tp[0] + mov 8(%rsp),@ri[1] # tp[1] + shr \$2,$j # j=num/4-1 + lea (%rsp),$ap # borrow ap for tp + xor $i,$i # i=0 and clear CF! + + sub 0($np),@ri[0] + mov 16($ap),@ri[2] # tp[2] + mov 24($ap),@ri[3] # tp[3] + sbb 8($np),@ri[1] + +.Lsub4x: + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 16($np,$i,8),@ri[2] + mov 32($ap,$i,8),@ri[0] # tp[i+1] + mov 40($ap,$i,8),@ri[1] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 32($np,$i,8),@ri[0] + mov 48($ap,$i,8),@ri[2] + mov 56($ap,$i,8),@ri[3] + sbb 40($np,$i,8),@ri[1] + lea 4($i),$i # i++ + dec $j # doesn't affect CF! + jnz .Lsub4x + + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 32($ap,$i,8),@ri[0] # load overflow bit + sbb 16($np,$i,8),@ri[2] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + + sbb \$0,@ri[0] # handle upmost overflow bit + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + pxor %xmm0,%xmm0 + movq @ri[0],%xmm4 + pcmpeqd %xmm5,%xmm5 + pshufd \$0,%xmm4,%xmm4 + mov $num,$j + pxor %xmm4,%xmm5 + shr \$2,$j # j=num/4 + xor %eax,%eax # i=0 + + jmp .Lcopy4x +.align 16 +.Lcopy4x: # conditional copy + movdqa (%rsp,%rax),%xmm1 + movdqu ($rp,%rax),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax),%xmm3 + movdqa %xmm0,(%rsp,%rax) + por %xmm2,%xmm1 + movdqu 16($rp,%rax),%xmm2 + movdqu %xmm1,($rp,%rax) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax) + por %xmm2,%xmm3 + movdqu %xmm3,16($rp,%rax) + lea 32(%rax),%rax + dec $j + jnz .Lcopy4x +___ +} +$code.=<<___; + mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi, 8 + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont,.-bn_mul4x_mont +___ +}}} + {{{ +###################################################################### +# void bn_sqr8x_mont( +my $rptr="%rdi"; # const BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $mulx_adx_capable="%rdx"; # Different than upstream! 
+my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___ if ($addx); +.extern bn_sqrx8x_internal # see x86_64-mont5 module +___ +$code.=<<___; +.extern bn_sqr8x_internal # see x86_64-mont5 module + +.globl bn_sqr8x_mont +.type bn_sqr8x_mont,\@function,6 +.align 32 +bn_sqr8x_mont: +.cfi_startproc + _CET_ENDBR + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lsqr8x_prologue: + + mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes + shl \$3+2,%r10 # 4*$num + neg $num + + ############################################################## + # ensure that stack frame doesn't alias with $aptr modulo + # 4096. this is done to allow memory disambiguation logic + # do its job. + # + lea -64(%rsp,$num,2),%r11 + mov %rsp,%rbp + mov ($n0),$n0 # *n0 + sub $aptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lsqr8x_sp_alt + sub %r11,%rbp # align with $aptr + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) + jmp .Lsqr8x_sp_done + +.align 32 +.Lsqr8x_sp_alt: + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lsqr8x_sp_done: + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 +.Lsqr8x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: + + mov $num,%r10 + neg $num + + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 +.Lsqr8x_body: + + movq $nptr, %xmm2 # save pointer to modulus + pxor %xmm0,%xmm0 + movq $rptr,%xmm1 # save $rptr + movq %r10, %xmm3 # -$num +___ +$code.=<<___ if ($addx); + test $mulx_adx_capable,$mulx_adx_capable + jz .Lsqr8x_nox + + call bn_sqrx8x_internal # see x86_64-mont5 module + # %rax top-most carry + # %rbp nptr + # %rcx -8*num + # %r8 end of tp[2*num] + lea (%r8,%rcx),%rbx + mov %rcx,$num + mov %rcx,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: +___ +$code.=<<___; + call bn_sqr8x_internal # see x86_64-mont5 module + # %rax top-most carry + # %rbp nptr + # %r8 -8*num + # %rdi end of tp[2*num] + lea (%rdi,$num),%rbx + mov $num,%rcx + mov $num,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_sub: + mov 8*0(%rbx),%r12 + mov 8*1(%rbx),%r13 + mov 8*2(%rbx),%r14 + mov 8*3(%rbx),%r15 + lea 8*4(%rbx),%rbx + sbb 8*0(%rbp),%r12 + sbb 8*1(%rbp),%r13 + sbb 8*2(%rbp),%r14 + sbb 8*3(%rbp),%r15 + lea 8*4(%rbp),%rbp + mov %r12,8*0($rptr) + mov %r13,8*1($rptr) + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + inc %rcx # preserves %cf + jnz .Lsqr8x_sub + + sbb \$0,%rax # top-most carry + lea (%rbx,$num),%rbx # rewind + lea ($rptr,$num),$rptr # rewind + + movq %rax,%xmm1 + pxor %xmm0,%xmm0 + pshufd \$0,%xmm1,%xmm1 + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + jmp .Lsqr8x_cond_copy + +.align 32 +.Lsqr8x_cond_copy: + movdqa 16*0(%rbx),%xmm2 + movdqa 16*1(%rbx),%xmm3 + lea 16*2(%rbx),%rbx + movdqu 
16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2(%rbx) # zero tp + movdqa %xmm0,-16*1(%rbx) + movdqa %xmm0,-16*2(%rbx,%rdx) + movdqa %xmm0,-16*1(%rbx,%rdx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + add \$32,$num + jnz .Lsqr8x_cond_copy + + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lsqr8x_epilogue: + ret +.cfi_endproc +.size bn_sqr8x_mont,.-bn_sqr8x_mont +___ +}}} + +if ($addx) {{{ +my $bp="%rdx"; # original value + +$code.=<<___; +.globl bn_mulx4x_mont +.type bn_mulx4x_mont,\@function,6 +.align 32 +bn_mulx4x_mont: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmulx4x_prologue: + + shl \$3,${num}d # convert $num to bytes + xor %r10,%r10 + sub $num,%r10 # -$num + mov ($n0),$n0 # *n0 + lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) + and \$-128,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + lea ($bp,$num),%r10 + ############################################################## + # Stack layout + # +0 num + # +8 off-loaded &b[i] + # +16 end of b[num] + # +24 saved n0 + # +32 saved rp + # +40 saved %rsp + # +48 inner counter + # +56 + # +64 tmp[num+1] + # + mov $num,0(%rsp) # save $num + shr \$5,$num + mov %r10,16(%rsp) # end of b[num] + sub \$1,$num + mov $n0, 24(%rsp) # save *n0 + mov $rp, 32(%rsp) # save $rp + mov %rax,40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 + mov $num,48(%rsp) # inner counter + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: +___ +my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= + ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); +my $rptr=$bptr; +$code.=<<___; + lea 8($bp),$bptr + mov ($bp),%rdx # b[0], $bp==%rdx actually + lea 64+32(%rsp),$tptr + mov %rdx,$bi + + mulx 0*8($aptr),$mi,%rax # a[0]*b[0] + mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] + add %rax,%r11 + mov $bptr,8(%rsp) # off-load &b[i] + mulx 2*8($aptr),%r12,%r13 # ... 
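
This is the MULX/ADCX/ADOX path: MULX produces a 128-bit product without touching flags, ADCX propagates a carry only through CF and ADOX only through OF, so the multiply-accumulate chain and the offset high-half chain can run interleaved without serializing on a single carry flag. A hedged C sketch of the idea using the BMI2/ADX intrinsics (compile with -mbmi2 -madx; whether the compiler really emits adcx/adox is up to it, and the helper below is illustrative, not this routine's structure):

#include <immintrin.h>

/* acc[0..4] += a[0..3] * b, assuming acc[4] == 0 on entry so the result
 * still fits in five limbs.  One carry chain accumulates the low halves
 * of the products, the other the high halves, one limb up. */
static void mac_4x1(unsigned long long acc[5], const unsigned long long a[4],
                    unsigned long long b) {
    unsigned char cf = 0, of = 0;
    for (int i = 0; i < 4; i++) {
        unsigned long long hi;
        unsigned long long lo = _mulx_u64(a[i], b, &hi);
        cf = _addcarryx_u64(cf, acc[i], lo, &acc[i]);          /* CF chain */
        of = _addcarryx_u64(of, acc[i + 1], hi, &acc[i + 1]);  /* OF chain */
    }
    /* fold the leftover low-chain carry; nothing overflows acc[4] given
     * the entry assumption */
    (void)_addcarry_u64(cf, acc[4], 0, &acc[4]);
    (void)of;
}
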
+ adc %r14,%r12 + adc \$0,%r13 + + mov $mi,$bptr # borrow $bptr + imulq 24(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + lea 4*8($aptr),$aptr + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,$bptr # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 + mov 48(%rsp),$bptr # counter value + mov %r10,-4*8($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + lea 4*8($nptr),$nptr + mov %r12,-2*8($tptr) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcx $zero,%r15 # cf=0, modulo-scheduled + mulx 0*8($aptr),%r10,%rax # a[4]*b[0] + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + .byte 0x67,0x67 + mov $mi,%rdx + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + mov %r11,-4*8($tptr) + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_1st + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + adc $zero,%r15 # modulo-scheduled + add %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + mov ($bptr),%rdx # b[i] + lea 8($bptr),$bptr # b++ + sub $num,$aptr # rewind $aptr + mov %r15,($tptr) # save top-most carry + lea 64+4*8(%rsp),$tptr + sub $num,$nptr # rewind $nptr + + mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] + xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 + mov %rdx,$bi + mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] + adox -4*8($tptr),$mi + adcx %r14,%r11 + mulx 2*8($aptr),%r15,%r13 # ... + adox -3*8($tptr),%r11 + adcx %r15,%r12 + adox -2*8($tptr),%r12 + adcx $zero,%r13 + adox $zero,%r13 + + mov $bptr,8(%rsp) # off-load &b[i] + mov $mi,%r15 + imulq 24(%rsp),$mi # "t[0]"*n0 + xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + adcx %rax,%r13 + adox -1*8($tptr),%r13 + adcx $zero,%r14 + lea 4*8($aptr),$aptr + adox $zero,%r14 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + mov %r10,-4*8($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + lea 4*8($nptr),$nptr + adcx %rax,%r12 + adox $zero,%r15 # of=0 + mov 48(%rsp),$bptr # counter value + mov %r12,-2*8($tptr) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulx 0*8($aptr),%r10,%rax # a[4]*b[i] + adcx $zero,%r15 # cf=0, modulo-scheduled + adox %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] + adcx 0*8($tptr),%r10 + adox %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... 
+ adcx 1*8($tptr),%r11 + adox %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + mov $mi,%rdx + adcx 2*8($tptr),%r12 + adox %rax,%r13 + adcx 3*8($tptr),%r13 + adox $zero,%r14 # of=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + adcx $zero,%r14 # cf=0 + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-4*8($tptr) + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_inner + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + adc $zero,%r15 # modulo-scheduled + sub 0*8($tptr),$zero # pull top-most carry + adc %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + + cmp 16(%rsp),$bptr + jne .Lmulx4x_outer + + lea 64(%rsp),$tptr + sub $num,$nptr # rewind $nptr + neg %r15 + mov $num,%rdx + shr \$3+2,$num # %cf=0 + mov 32(%rsp),$rptr # restore rp + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + mov 8*0($tptr),%r11 + mov 8*1($tptr),%r12 + mov 8*2($tptr),%r13 + mov 8*3($tptr),%r14 + lea 8*4($tptr),$tptr + sbb 8*0($nptr),%r11 + sbb 8*1($nptr),%r12 + sbb 8*2($nptr),%r13 + sbb 8*3($nptr),%r14 + lea 8*4($nptr),$nptr + mov %r11,8*0($rptr) + mov %r12,8*1($rptr) + mov %r13,8*2($rptr) + mov %r14,8*3($rptr) + lea 8*4($rptr),$rptr + dec $num # preserves %cf + jnz .Lmulx4x_sub + + sbb \$0,%r15 # top-most carry + lea 64(%rsp),$tptr + sub %rdx,$rptr # rewind + + movq %r15,%xmm1 + pxor %xmm0,%xmm0 + pshufd \$0,%xmm1,%xmm1 + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 16*0($tptr),%xmm2 + movdqa 16*1($tptr),%xmm3 + lea 16*2($tptr),$tptr + movdqu 16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2($tptr) # zero tp + movdqa %xmm0,-16*1($tptr) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + sub \$32,%rdx + jnz .Lmulx4x_cond_copy + + mov %rdx,($tptr) + + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont +___ +}}} +$code.=<<___; +.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by " +.align 16 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type mul_handler,\@abi-omnipotent +.align 16 +mul_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue 
label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 192($context),%r10 # pull $num + mov 8(%rax,%r10,8),%rax # pull saved stack pointer + + jmp .Lcommon_pop_regs +.size mul_handler,.-mul_handler + +.type sqr_handler,\@abi-omnipotent +.align 16 +sqr_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<.Lsqr_prologue + jb .Lcommon_seh_tail + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # body label + cmp %r10,%rbx # context->Rip<.Lsqr_body + jb .Lcommon_pop_regs + + mov 152($context),%rax # pull context->Rsp + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue + jae .Lcommon_seh_tail + + mov 40(%rax),%rax # pull saved stack pointer + +.Lcommon_pop_regs: + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size sqr_handler,.-sqr_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_bn_mul_mont_nohw + .rva .LSEH_end_bn_mul_mont_nohw + .rva .LSEH_info_bn_mul_mont_nohw + + .rva .LSEH_begin_bn_mul4x_mont + .rva .LSEH_end_bn_mul4x_mont + .rva .LSEH_info_bn_mul4x_mont + + .rva .LSEH_begin_bn_sqr8x_mont + .rva .LSEH_end_bn_sqr8x_mont + .rva .LSEH_info_bn_sqr8x_mont +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_bn_mulx4x_mont + .rva .LSEH_end_bn_mulx4x_mont + .rva .LSEH_info_bn_mulx4x_mont +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_bn_mul_mont_nohw: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] +.LSEH_info_bn_mul4x_mont: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] +.LSEH_info_bn_sqr8x_mont: + .byte 9,0,0,0 + .rva sqr_handler + .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] +.align 8 +___ +$code.=<<___ if ($addx); +.LSEH_info_bn_mulx4x_mont: + .byte 9,0,0,0 + .rva sqr_handler + 
.rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] +.align 8 +___ +} + +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont5.pl new file mode 100644 index 0000000000..e7628712a7 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/asm/x86_64-mont5.pl @@ -0,0 +1,3352 @@ +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# August 2011. +# +# Companion to x86_64-mont.pl that optimizes cache-timing attack +# countermeasures. The subroutines are produced by replacing bp[i] +# references in their x86_64-mont.pl counterparts with cache-neutral +# references to powers table computed in BN_mod_exp_mont_consttime. +# In addition subroutine that scatters elements of the powers table +# is implemented, so that scatter-/gathering can be tuned without +# bn_exp.c modifications. + +# August 2013. +# +# Add MULX/AD*X code paths and additional interfaces to optimize for +# branch prediction unit. For input lengths that are multiples of 8 +# the np argument is not just modulus value, but one interleaved +# with 0. This is to optimize post-condition... + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +$addx = 1; + +# int bn_mul_mont_gather5_nohw( +$rp="%rdi"; # BN_ULONG *rp, +$ap="%rsi"; # const BN_ULONG *ap, +$bp="%rdx"; # const BN_ULONG *bp, +$np="%rcx"; # const BN_ULONG *np, +$n0="%r8"; # const BN_ULONG *n0, +$num="%r9"; # int num, + # int idx); # 0 to 2^5-1, "index" in $bp holding + # pre-computed powers of a', interlaced + # in such manner that b[0] is $bp[idx], + # b[1] is [2^5+idx], etc. 
+$lo0="%r10"; +$hi0="%r11"; +$hi1="%r13"; +$i="%r14"; +$j="%r15"; +$m0="%rbx"; +$m1="%rbp"; + +$code=<<___; +.text + +___ +{{{ +my @A=("%r10","%r11"); +my @N=("%r13","%rdi"); +$code.=<<___; +.globl bn_mul4x_mont_gather5 +.type bn_mul4x_mont_gather5,\@function,6 +.align 32 +bn_mul4x_mont_gather5: +.cfi_startproc + _CET_ENDBR + .byte 0x67 + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmul4x_prologue: + + .byte 0x67 + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes + neg $num # -$num + + ############################################################## + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] + # + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rp,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lmul4xsp_alt + sub %r11,%rbp # align with $rp + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lmul4xsp_done: + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + neg $num + + mov %rax,40(%rsp) +.cfi_cfa_expression %rsp+40,deref,+8 +.Lmul4x_body: + + call mul4x_internal + + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,\@abi-omnipotent +.align 32 +mul4x_internal: +.cfi_startproc + shl \$5,$num # $num was in bytes + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index + lea .Linc(%rip),%rax + lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) + shr \$5,$num # restore $num +___ + $bp="%r12"; + $STRIDE=2**5*8; # 5 is "window size" + $tp=$i; +$code.=<<___; + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128(%rdx),$bp # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67,0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# Calculate masks by comparing 0..31 to $idx and save result to stack. 
+# +# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored +# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and +# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations +# are scheduled in groups of four. +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + .byte 0x67 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 + .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($i+1)-128`($bp),%xmm1 + pand `16*($i+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bp),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm4 + movdqa `16*($i+1)-128`($bp),%xmm5 + movdqa `16*($i+2)-128`($bp),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + # Combine the upper and lower halves of %xmm0. + pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves. 
+ por %xmm1,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[0] + + mov %r13,16+8(%rsp) # save end of b[num] + mov $rp, 56+8(%rsp) # save $rp + + mov ($n0),$n0 # pull n0[0] value + mov ($ap),%rax + lea ($ap,$num),$ap # end of a[num] + neg $num + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$A[0] + mov ($np),%rax + + imulq $A[0],$m1 # "tp[0]"*n0 + lea 64+8(%rsp),$tp + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # discarded + mov 8($ap,$num),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 + add %rax,$A[1] + mov 8*1($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 + add %rax,$N[1] + mov 16($ap,$num),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 4*8($num),$j # j=4 + lea 8*4($np),$np + adc \$0,%rdx + mov $N[1],($tp) + mov %rdx,$N[0] + jmp .L1st4x + +.align 32 +.L1st4x: + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -8*2($np),%rax + lea 32($tp),$tp + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8*1($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16($tp) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov 8*0($np),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-8($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov 8*1($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap,$j),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + lea 8*4($np),$np + adc \$0,%rdx + mov $N[1],($tp) # tp[j-1] + mov %rdx,$N[0] + + add \$32,$j # j+=4 + jnz .L1st4x + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -8*2($np),%rax + lea 32($tp),$tp + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8*1($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$num),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16($tp) # tp[j-1] + mov %rdx,$N[0] + + lea ($np,$num),$np # rewind $np + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + mov $N[0],-8($tp) + + jmp .Louter4x + +.align 32 +.Louter4x: + lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm0 + movdqa `16*($i+1)-128`($bp),%xmm1 + movdqa `16*($i+2)-128`($bp),%xmm2 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+0)-128`(%rdx),%xmm0 + pand `16*($i+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + # Combine the upper and lower halves of %xmm4 as %xmm0. + pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. 
+ por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[i] + + mov ($tp,$num),$A[0] + mov $n0,$m1 + mulq $m0 # ap[0]*bp[i] + add %rax,$A[0] # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + imulq $A[0],$m1 # tp[0]*n0 + mov %rdx,$A[1] + mov $N[1],($tp) # store upmost overflow bit + + lea ($tp,$num),$tp # rewind $tp + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # "$N[0]", discarded + mov 8($ap,$num),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8*1($np),%rax + adc \$0,%rdx + add 8($tp),$A[1] # +tp[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap,$num),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] + lea 4*8($num),$j # j=4 + lea 8*4($np),$np + adc \$0,%rdx + mov %rdx,$N[0] + jmp .Linner4x + +.align 32 +.Linner4x: + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -8*2($np),%rax + adc \$0,%rdx + add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] + lea 32($tp),$tp + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-32($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8*1($np),%rax + adc \$0,%rdx + add -8($tp),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-24($tp) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov 8*0($np),%rax + adc \$0,%rdx + add ($tp),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-16($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8*1($np),%rax + adc \$0,%rdx + add 8($tp),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap,$j),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 8*4($np),$np + adc \$0,%rdx + mov $N[0],-8($tp) # tp[j-1] + mov %rdx,$N[0] + + add \$32,$j # j+=4 + jnz .Linner4x + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -8*2($np),%rax + adc \$0,%rdx + add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] + lea 32($tp),$tp + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-32($tp) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov $m1,%rax + mov -8*1($np),$m1 + adc \$0,%rdx + add -8($tp),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$num),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-24($tp) # tp[j-1] + mov %rdx,$N[0] + + mov $N[1],-16($tp) # tp[j-1] + lea ($np,$num),$np # rewind $np + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + add ($tp),$N[0] # pull upmost overflow bit + adc \$0,$N[1] # upmost overflow bit + mov $N[0],-8($tp) + + cmp 16+8(%rsp),$bp + jb .Louter4x +___ +if (1) { +$code.=<<___; + xor %rax,%rax + sub $N[0],$m1 # compare top-most words + adc $j,$j # $j is zero + or $j,$N[1] + sub $N[1],%rax # %rax=-$N[1] + lea ($tp,$num),%rbx # tptr in .sqr4x_sub + mov ($np),%r12 + lea ($np),%rbp # nptr in .sqr4x_sub + mov %r9,%rcx + sar \$3+2,%rcx + mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1(%rbp),%r13 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqr4x_sub_entry +___ +} +$code.=<<___; +.cfi_endproc +.size 
mul4x_internal,.-mul4x_internal +___ +}}} + {{{ +###################################################################### +# void bn_power5_nohw( +my $rptr="%rdi"; # BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # const void *table, +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + # int pwr + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___; +.globl bn_power5_nohw +.type bn_power5_nohw,\@function,6 +.align 32 +bn_power5_nohw: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lpower5_prologue: + + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, come before the first access. + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10d # 3*$num + neg $num + mov ($n0),$n0 # *n0 + + ############################################################## + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] + # + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lpwr_sp_alt + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lpwr_sp_done: + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + + mov $num,%r10 + neg $num + + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +32 saved *n0 + # +40 saved %rsp + # +48 t[2*$num] + # + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 +.Lpower5_body: + movq $rptr,%xmm1 # save $rptr, used in sqr8x + movq $nptr,%xmm2 # save $nptr + movq %r10, %xmm3 # -$num, used in sqr8x + movq $bptr,%xmm4 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + + movq %xmm2,$nptr + movq %xmm4,$bptr + mov $aptr,$rptr + mov 40(%rsp),%rax + lea 32(%rsp),$n0 + + call mul4x_internal + + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp 
+.cfi_def_cfa_register %rsp +.Lpower5_epilogue: + ret +.cfi_endproc +.size bn_power5_nohw,.-bn_power5_nohw + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,\@abi-omnipotent +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: +.cfi_startproc + _CET_ENDBR + ############################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + ############################################################## + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[2]a[1] + # a[4]a[0] + # a[3]a[1] + # a[5]a[0] + # a[4]a[1] + # a[3]a[2] + # a[6]a[0] + # a[5]a[1] + # a[4]a[2] + # a[7]a[0] + # a[6]a[1] + # a[5]a[2] + # a[4]a[3] + # a[7]a[1] + # a[6]a[2] + # a[5]a[3] + # a[7]a[2] + # a[6]a[3] + # a[5]a[4] + # a[7]a[3] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[2]a[1] + # a[3]a[1] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[3]a[2] + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # a[5]a[4] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[0]a[0] + # a[1]a[1] + # a[2]a[2] + # a[3]a[3] + # a[4]a[4] + # a[5]a[5] + # a[6]a[6] + # a[7]a[7] + + lea 32(%r10),$i # $i=-($num-32) + lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] + + mov $num,$j # $j=$num + + # comments apply to $num==8 case + mov -32($aptr,$i),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov %rax,$A0[0] # a[1]*a[0] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + mov $A0[0],-24($tptr,$i) # t[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc \$0,%rdx + mov $A0[1],-16($tptr,$i) # t[2] + mov %rdx,$A0[0] + + + mov -8($aptr,$i),$ai # a[3] + mul $a1 # a[2]*a[1] + mov %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A1[1] + + lea ($i),$j + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[3] + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + mov ($aptr,$j),$ai # a[4] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + mov %rdx,$A1[0] + adc \$0,$A1[0] + + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + mov 8($aptr,$j),$ai # a[5] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $ai,%rax + mov $A0[1],($tptr,$j) # t[4] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + mov 16($aptr,$j),$ai # a[6] + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + + mul $a1 # a[5]*a[3] + add %rax,$A1[1] # a[5]*a[3]+t[6] + mov $ai,%rax + mov $A0[0],8($tptr,$j) # t[5] + mov %rdx,$A1[0] + adc \$0,$A1[0] + + mul $a0 # a[6]*a[2] + add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] + mov $ai,%rax # a[3] + mov 24($aptr,$j),$ai # a[7] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + + mul $a1 # a[6]*a[5] + add %rax,$A1[0] # a[6]*a[5]+t[7] + mov $ai,%rax + mov $A0[1],16($tptr,$j) # t[6] + mov %rdx,$A1[1] + adc 
\$0,$A1[1] + lea 32($j),$j + + mul $a0 # a[7]*a[4] + add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[7] + + cmp \$0,$j + jne .Lsqr4x_1st + + mul $a1 # a[7]*a[5] + add %rax,$A1[1] + lea 16($i),$i + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[8] + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[9] + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: # comments apply to $num==6 case + mov -32($aptr,$i),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov -24($tptr,$i),$A0[0] # t[1] + add %rax,$A0[0] # a[1]*a[0]+t[1] + mov $ai,%rax # a[2] + adc \$0,%rdx + mov $A0[0],-24($tptr,$i) # t[1] + mov %rdx,$A0[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc \$0,%rdx + add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] + mov %rdx,$A0[0] + adc \$0,$A0[0] + mov $A0[1],-16($tptr,$i) # t[2] + + xor $A1[0],$A1[0] + + mov -8($aptr,$i),$ai # a[3] + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + adc \$0,%rdx + add -8($tptr,$i),$A1[0] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc \$0,%rdx + add $A1[0],$A0[0] + mov %rdx,$A0[1] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$i) # t[3] + + lea ($i),$j + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + mov ($aptr,$j),$ai # a[4] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + mov %rdx,$A1[0] + adc \$0,$A1[0] + add ($tptr,$j),$A1[1] + adc \$0,$A1[0] + + .byte 0x67 + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + mov 8($aptr,$j),$ai # a[5] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $A0[1],($tptr,$j) # t[4] + mov $ai,%rax + mov %rdx,$A1[1] + adc \$0,$A1[1] + add 8($tptr,$j),$A1[0] + lea 16($j),$j # j++ + adc \$0,$A1[1] + + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + adc \$0,%rdx + add $A1[0],$A0[0] + mov %rdx,$A0[1] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below + + cmp \$0,$j + jne .Lsqr4x_inner + + .byte 0x67 + mul $a1 # a[5]*a[3] + add %rax,$A1[1] + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[6], "preloaded t[2]" below + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[7], "preloaded t[3]" below + + add \$16,$i + jnz .Lsqr4x_outer + + # comments apply to $num==4 case + mov -32($aptr),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + adc \$0,$A0[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + mov $A0[0],-24($tptr) # t[1] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] + mov -8($aptr),$ai # a[3] + adc \$0,$A0[0] + + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] + mov $ai,%rax + mov $A0[1],-16($tptr) # t[2] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A0[1] + adc 
\$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr) # t[3] + + mul $a1 # a[3]*a[1] + add %rax,$A1[1] + mov -16($aptr),%rax # a[2] + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[4] + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[5] + + mul $ai # a[2]*a[3] +___ +{ +my ($shift,$carry)=($a0,$a1); +my @S=(@A1,$ai,$n0); +$code.=<<___; + add \$16,$i + xor $shift,$shift + sub $num,$i # $i=16-$num + xor $carry,$carry + + add $A1[0],%rax # t[5] + adc \$0,%rdx + mov %rax,8($tptr) # t[5] + mov %rdx,16($tptr) # t[6] + mov $carry,24($tptr) # t[7] + + mov -16($aptr,$i),%rax # a[0] + lea 48+8(%rsp),$tptr + xor $A0[0],$A0[0] # t[0] + mov 8($tptr),$A0[1] # t[1] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov 16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 32($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],16($tptr) + adc %rdx,$S[3] + lea 16($i),$i + mov $S[3],24($tptr) + sbb $carry,$carry # mov cf,$carry + lea 64($tptr),$tptr + jmp .Lsqr4x_shift_n_add + +.align 32 +.Lsqr4x_shift_n_add: + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],-24($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 0($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],-16($tptr) + adc %rdx,$S[3] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + mov $S[3],-8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov 16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov 8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],0($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],8($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 32($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # 
shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 16($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],16($tptr) + adc %rdx,$S[3] + mov $S[3],24($tptr) + sbb $carry,$carry # mov cf,$carry + lea 64($tptr),$tptr + add \$32,$i + jnz .Lsqr4x_shift_n_add + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + .byte 0x67 + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift + mov $S[1],-24($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + adc %rax,$S[2] + adc %rdx,$S[3] + mov $S[2],-16($tptr) + mov $S[3],-8($tptr) +___ +} +###################################################################### +# Montgomery reduction part, "word-by-word" algorithm. +# +# This new path is inspired by multiple submissions from Intel, by +# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, +# Vinodh Gopal... +{ +my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); + +$code.=<<___; + movq %xmm2,$nptr +__bn_sqr8x_reduction: + xor %rax,%rax + lea ($nptr,$num),%rcx # end of n[] + lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer + mov %rcx,0+8(%rsp) + lea 48+8(%rsp,$num),$tptr # end of initial t[] window + mov %rdx,8+8(%rsp) + neg $num + jmp .L8x_reduction_loop + +.align 32 +.L8x_reduction_loop: + lea ($tptr,$num),$tptr # start of current t[] window + .byte 0x66 + mov 8*0($tptr),$m0 + mov 8*1($tptr),%r9 + mov 8*2($tptr),%r10 + mov 8*3($tptr),%r11 + mov 8*4($tptr),%r12 + mov 8*5($tptr),%r13 + mov 8*6($tptr),%r14 + mov 8*7($tptr),%r15 + mov %rax,(%rdx) # store top-most carry bit + lea 8*8($tptr),$tptr + + .byte 0x67 + mov $m0,%r8 + imulq 32+8(%rsp),$m0 # n0*a[0] + mov 8*0($nptr),%rax # n[0] + mov \$8,%ecx + jmp .L8x_reduce + +.align 32 +.L8x_reduce: + mulq $m0 + mov 8*1($nptr),%rax # n[1] + neg %r8 + mov %rdx,%r8 + adc \$0,%r8 + + mulq $m0 + add %rax,%r9 + mov 8*2($nptr),%rax + adc \$0,%rdx + add %r9,%r8 + mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] + mov %rdx,%r9 + adc \$0,%r9 + + mulq $m0 + add %rax,%r10 + mov 8*3($nptr),%rax + adc \$0,%rdx + add %r10,%r9 + mov 32+8(%rsp),$carry # pull n0, borrow $carry + mov %rdx,%r10 + adc \$0,%r10 + + mulq $m0 + add %rax,%r11 + mov 8*4($nptr),%rax + adc \$0,%rdx + imulq %r8,$carry # modulo-scheduled + add %r11,%r10 + mov %rdx,%r11 + adc \$0,%r11 + + mulq $m0 + add %rax,%r12 + mov 8*5($nptr),%rax + adc \$0,%rdx + add %r12,%r11 + mov %rdx,%r12 + adc \$0,%r12 + + mulq $m0 + add %rax,%r13 + mov 8*6($nptr),%rax + adc \$0,%rdx + add %r13,%r12 + mov %rdx,%r13 + adc \$0,%r13 + + mulq $m0 + add %rax,%r14 + mov 8*7($nptr),%rax + adc \$0,%rdx + add %r14,%r13 + mov %rdx,%r14 + adc \$0,%r14 + + mulq $m0 + mov $carry,$m0 # n0*a[i] + add %rax,%r15 + mov 8*0($nptr),%rax # n[0] + adc \$0,%rdx + add %r15,%r14 + mov %rdx,%r15 + adc \$0,%r15 + + dec %ecx + jnz .L8x_reduce + + lea 8*8($nptr),$nptr + xor %rax,%rax + mov 8+8(%rsp),%rdx # pull end of t[] + cmp 0+8(%rsp),$nptr # end of n[]? 
+ jae .L8x_no_tail + + .byte 0x66 + add 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + sbb $carry,$carry # top carry + + mov 48+56+8(%rsp),$m0 # pull n0*a[0] + mov \$8,%ecx + mov 8*0($nptr),%rax + jmp .L8x_tail + +.align 32 +.L8x_tail: + mulq $m0 + add %rax,%r8 + mov 8*1($nptr),%rax + mov %r8,($tptr) # save result + mov %rdx,%r8 + adc \$0,%r8 + + mulq $m0 + add %rax,%r9 + mov 8*2($nptr),%rax + adc \$0,%rdx + add %r9,%r8 + lea 8($tptr),$tptr # $tptr++ + mov %rdx,%r9 + adc \$0,%r9 + + mulq $m0 + add %rax,%r10 + mov 8*3($nptr),%rax + adc \$0,%rdx + add %r10,%r9 + mov %rdx,%r10 + adc \$0,%r10 + + mulq $m0 + add %rax,%r11 + mov 8*4($nptr),%rax + adc \$0,%rdx + add %r11,%r10 + mov %rdx,%r11 + adc \$0,%r11 + + mulq $m0 + add %rax,%r12 + mov 8*5($nptr),%rax + adc \$0,%rdx + add %r12,%r11 + mov %rdx,%r12 + adc \$0,%r12 + + mulq $m0 + add %rax,%r13 + mov 8*6($nptr),%rax + adc \$0,%rdx + add %r13,%r12 + mov %rdx,%r13 + adc \$0,%r13 + + mulq $m0 + add %rax,%r14 + mov 8*7($nptr),%rax + adc \$0,%rdx + add %r14,%r13 + mov %rdx,%r14 + adc \$0,%r14 + + mulq $m0 + mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] + add %rax,%r15 + adc \$0,%rdx + add %r15,%r14 + mov 8*0($nptr),%rax # pull n[0] + mov %rdx,%r15 + adc \$0,%r15 + + dec %ecx + jnz .L8x_tail + + lea 8*8($nptr),$nptr + mov 8+8(%rsp),%rdx # pull end of t[] + cmp 0+8(%rsp),$nptr # end of n[]? + jae .L8x_tail_done # break out of loop + + mov 48+56+8(%rsp),$m0 # pull n0*a[0] + neg $carry + mov 8*0($nptr),%rax # pull n[0] + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + sbb $carry,$carry # top carry + + mov \$8,%ecx + jmp .L8x_tail + +.align 32 +.L8x_tail_done: + xor %rax,%rax + add (%rdx),%r8 # can this overflow? + adc \$0,%r9 + adc \$0,%r10 + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 + adc \$0,%rax + + neg $carry +.L8x_no_tail: + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + adc \$0,%rax # top-most carry + mov -8($nptr),%rcx # np[num-1] + xor $carry,$carry + + movq %xmm2,$nptr # restore $nptr + + mov %r8,8*0($tptr) # store top 512 bits + mov %r9,8*1($tptr) + movq %xmm3,$num # $num is %r9, can't be moved upwards + mov %r10,8*2($tptr) + mov %r11,8*3($tptr) + mov %r12,8*4($tptr) + mov %r13,8*5($tptr) + mov %r14,8*6($tptr) + mov %r15,8*7($tptr) + lea 8*8($tptr),$tptr + + cmp %rdx,$tptr # end of t[]? 
+ jb .L8x_reduction_loop + ret +.cfi_endproc +.size bn_sqr8x_internal,.-bn_sqr8x_internal +___ +} +############################################################## +# Post-condition, 4x unrolled +# +{ +my ($tptr,$nptr)=("%rbx","%rbp"); +$code.=<<___; +.type __bn_post4x_internal,\@abi-omnipotent +.align 32 +__bn_post4x_internal: +.cfi_startproc + mov 8*0($nptr),%r12 + lea (%rdi,$num),$tptr # %rdi was $tptr above + mov $num,%rcx + movq %xmm1,$rptr # restore $rptr + neg %rax + movq %xmm1,$aptr # prepare for back-to-back call + sar \$3+2,%rcx + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqr4x_sub_entry + +.align 16 +.Lsqr4x_sub: + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqr4x_sub_entry: + lea 8*4($nptr),$nptr + not %r12 + not %r13 + not %r14 + not %r15 + and %rax,%r12 + and %rax,%r13 + and %rax,%r14 + and %rax,%r15 + + neg %r10 # mov %r10,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 + mov %r12,8*0($rptr) + lea 8*4($tptr),$tptr + mov %r13,8*1($rptr) + sbb %r10,%r10 # mov %cf,%r10 + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + + inc %rcx # pass %cf + jnz .Lsqr4x_sub + + mov $num,%r10 # prepare for back-to-back call + neg $num # restore $num + ret +.cfi_endproc +.size __bn_post4x_internal,.-__bn_post4x_internal +___ +} +}}} + +if ($addx) {{{ +my $bp="%rdx"; # restore original value + +$code.=<<___; +.globl bn_mulx4x_mont_gather5 +.type bn_mulx4x_mont_gather5,\@function,6 +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmulx4x_prologue: + + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes + neg $num # -$num + mov ($n0),$n0 # *n0 + + ############################################################## + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] 
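+	# Without this adjustment a stack word and a caller buffer word could
+	# share the same offset within a 4KB page; such 4K aliasing can make
+	# the CPU's memory-disambiguation logic assume a false dependence
+	# between the stores and loads below and stall the loads.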
+ # + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rp,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lmulx4xsp_alt + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lmulx4xsp_done: + and \$-64,%rbp # ensure alignment + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + ############################################################## + # Stack layout + # +0 -num + # +8 off-loaded &b[i] + # +16 end of b[num] + # +24 inner counter + # +32 saved n0 + # +40 saved %rsp + # +48 + # +56 saved rp + # +64 tmp[num+1] + # + mov $n0, 32(%rsp) # save *n0 + mov %rax,40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 +.Lmulx4x_body: + call mulx4x_internal + + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,\@abi-omnipotent +.align 32 +mulx4x_internal: +.cfi_startproc + mov $num,8(%rsp) # save -$num (it was in bytes) + mov $num,%r10 + neg $num # restore $num + shl \$5,$num + neg %r10 # restore $num + lea 128($bp,$num),%r13 # end of powers table (+size optimization) + shr \$5+5,$num + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument + sub \$1,$num + lea .Linc(%rip),%rax + mov %r13,16+8(%rsp) # end of b[num] + mov $num,24+8(%rsp) # inner counter + mov $rp, 56+8(%rsp) # save $rp +___ +my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= + ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); +my $rptr=$bptr; +my $STRIDE=2**5*8; # 5 is "window size" +$code.=<<___; + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128($bp),$bptr # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# Calculate masks by comparing 0..31 to $idx and save result to stack. +# +# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored +# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and +# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations +# are scheduled in groups of four. 
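+# The gather code below then ANDs each 16-byte pair of table words with its
+# mask and ORs everything together, so the selected power is extracted with a
+# memory access pattern that does not depend on $idx; this is the cache-timing
+# countermeasure described in the header of this file.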
+$code.=<<___; + .byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + .byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + + pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register + pand `16*($i+1)-128`($bptr),%xmm1 + pand `16*($i+2)-128`($bptr),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bptr),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm4 + movdqa `16*($i+1)-128`($bptr),%xmm5 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + pxor %xmm1,%xmm0 + # Combine the upper and lower halves of %xmm0. + pshufd \$0x4e,%xmm0,%xmm1 # Swap upper and lower halves. + por %xmm1,%xmm0 + lea $STRIDE($bptr),$bptr + movq %xmm0,%rdx # bp[0] + lea 64+8*4+8(%rsp),$tptr + + mov %rdx,$bi + mulx 0*8($aptr),$mi,%rax # a[0]*b[0] + mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] + add %rax,%r11 + mulx 2*8($aptr),%rax,%r13 # ... + adc %rax,%r12 + adc \$0,%r13 + mulx 3*8($aptr),%rax,%r14 + + mov $mi,%r15 + imulq 32+8(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + mov $mi,%rdx + + mov $bptr,8+8(%rsp) # off-load &b[i] + + lea 4*8($aptr),$aptr + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + mov 24+8(%rsp),$bptr # counter value + mov %r10,-8*4($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-8*3($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + lea 4*8($nptr),$nptr + mov %r12,-8*2($tptr) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcx $zero,%r15 # cf=0, modulo-scheduled + mulx 0*8($aptr),%r10,%rax # a[4]*b[0] + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... 
+ adcx %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + .byte 0x67,0x67 + mov $mi,%rdx + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + mov %r11,-4*8($tptr) + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_1st + + mov 8(%rsp),$num # load -num + adc $zero,%r15 # modulo-scheduled + lea ($aptr,$num),$aptr # rewind $aptr + add %r15,%r14 + mov 8+8(%rsp),$bptr # re-load &b[i] + adc $zero,$zero # top-most carry + mov %r14,-1*8($tptr) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) + pxor %xmm4,%xmm4 + .byte 0x67,0x67 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm0 + movdqa `16*($i+1)-128`($bptr),%xmm1 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+256`(%r10),%xmm0 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+256`(%r10),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)+256`(%r10),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)+256`(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + # Combine the upper and lower halves of %xmm4 as %xmm0. + pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. + por %xmm4,%xmm0 + lea $STRIDE($bptr),$bptr + movq %xmm0,%rdx # m0=bp[i] + + mov $zero,($tptr) # save top-most carry + lea 4*8($tptr,$num),$tptr # rewind $tptr + mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] + xor $zero,$zero # cf=0, of=0 + mov %rdx,$bi + mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] + adox -4*8($tptr),$mi # +t[0] + adcx %r14,%r11 + mulx 2*8($aptr),%r15,%r13 # ... + adox -3*8($tptr),%r11 + adcx %r15,%r12 + mulx 3*8($aptr),%rdx,%r14 + adox -2*8($tptr),%r12 + adcx %rdx,%r13 + lea ($nptr,$num),$nptr # rewind $nptr + lea 4*8($aptr),$aptr + adox -1*8($tptr),%r13 + adcx $zero,%r14 + adox $zero,%r14 + + mov $mi,%r15 + imulq 32+8(%rsp),$mi # "t[0]"*n0 + + mov $mi,%rdx + xor $zero,$zero # cf=0, of=0 + mov $bptr,8+8(%rsp) # off-load &b[i] + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,%r15 # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov 24+8(%rsp),$bptr # counter value + mov %r10,-8*4($tptr) + adcx %rax,%r12 + mov %r11,-8*3($tptr) + adox $zero,%r15 # of=0 + mov %r12,-8*2($tptr) + lea 4*8($nptr),$nptr + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulx 0*8($aptr),%r10,%rax # a[4]*b[i] + adcx $zero,%r15 # cf=0, modulo-scheduled + adox %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] + adcx 0*8($tptr),%r10 + adox %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... 
+ adcx 1*8($tptr),%r11 + adox %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + mov $mi,%rdx + adcx 2*8($tptr),%r12 + adox %rax,%r13 + adcx 3*8($tptr),%r13 + adox $zero,%r14 # of=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + adcx $zero,%r14 # cf=0 + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + adox %r15,%r13 + mov %r11,-4*8($tptr) + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + lea 4*8($nptr),$nptr + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_inner + + mov 0+8(%rsp),$num # load -num + adc $zero,%r15 # modulo-scheduled + sub 0*8($tptr),$bptr # pull top-most carry to %cf + mov 8+8(%rsp),$bptr # re-load &b[i] + mov 16+8(%rsp),%r10 + adc %r15,%r14 + lea ($aptr,$num),$aptr # rewind $aptr + adc $zero,$zero # top-most carry + mov %r14,-1*8($tptr) + + cmp %r10,$bptr + jb .Lmulx4x_outer + + mov -8($nptr),%r10 + mov $zero,%r8 + mov ($nptr,$num),%r12 + lea ($nptr,$num),%rbp # rewind $nptr + mov $num,%rcx + lea ($tptr,$num),%rdi # rewind $tptr + xor %eax,%eax + xor %r15,%r15 + sub %r14,%r10 # compare top-most words + adc %r15,%r15 + or %r15,%r8 + sar \$3+2,%rcx + sub %r8,%rax # %rax=-%r8 + mov 56+8(%rsp),%rdx # restore rp + dec %r12 # so that after 'not' we get -n[0] + mov 8*1(%rbp),%r13 + xor %r8,%r8 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqrx4x_sub_entry # common post-condition +.cfi_endproc +.size mulx4x_internal,.-mulx4x_internal +___ +} { +###################################################################### +# void bn_powerx5( +my $rptr="%rdi"; # BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # const void *table, +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + # int pwr); + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___; +.globl bn_powerx5 +.type bn_powerx5,\@function,6 +.align 32 +bn_powerx5: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lpowerx5_prologue: + + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes + neg $num + mov ($n0),$n0 # *n0 + + ############################################################## + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] 
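+	# The body below mirrors bn_power5_nohw: five back-to-back Montgomery
+	# squarings (__bn_sqrx8x_internal + __bn_postx4x_internal) followed by
+	# one Montgomery multiplication by the gathered power (mulx4x_internal),
+	# only built on the MULX/ADCX/ADOX code paths mentioned in the header.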
+ # + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lpwrx_sp_alt + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rbp +.Lpwrx_sp_done: + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + mov $num,%r10 + neg $num + + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +16 intermediate carry bit + # +24 top-most carry bit, used in reduction section + # +32 saved *n0 + # +40 saved %rsp + # +48 t[2*$num] + # + pxor %xmm0,%xmm0 + movq $rptr,%xmm1 # save $rptr + movq $nptr,%xmm2 # save $nptr + movq %r10, %xmm3 # -$num + movq $bptr,%xmm4 + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + mov %r10,$num # -num + mov $aptr,$rptr + movq %xmm2,$nptr + movq %xmm4,$bptr + mov 40(%rsp),%rax + + call mulx4x_internal + + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 + mov \$1,%rax + + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + ret +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,\@abi-omnipotent +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc + _CET_ENDBR + ################################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + ################################################################## + # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[2]a[1] + # a[3]a[1] + # a[3]a[2] + # + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # + # a[5]a[4] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] +___ +{ +my ($zero,$carry)=("%rbp","%rcx"); +my $aaptr=$zero; +$code.=<<___; + lea 48+8(%rsp),$tptr + lea ($aptr,$num),$aaptr + mov $num,0+8(%rsp) # save $num + mov $aaptr,8+8(%rsp) # save end of $aptr + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: + .byte 0x3e + movdqa %xmm0,0*8($tptr) + movdqa 
%xmm0,2*8($tptr) + movdqa %xmm0,4*8($tptr) + movdqa %xmm0,6*8($tptr) +.Lsqr8x_zero_start: # aligned at 32 + movdqa %xmm0,8*8($tptr) + movdqa %xmm0,10*8($tptr) + movdqa %xmm0,12*8($tptr) + movdqa %xmm0,14*8($tptr) + lea 16*8($tptr),$tptr + sub \$64,$num + jnz .Lsqrx8x_zero + + mov 0*8($aptr),%rdx # a[0], modulo-scheduled + #xor %r9,%r9 # t[1], ex-$num, zero already + xor %r10,%r10 + xor %r11,%r11 + xor %r12,%r12 + xor %r13,%r13 + xor %r14,%r14 + xor %r15,%r15 + lea 48+8(%rsp),$tptr + xor $zero,$zero # cf=0, cf=0 + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulx 1*8($aptr),%r8,%rax # a[1]*a[0] + adcx %r9,%r8 # a[1]*a[0]+=t[1] + adox %rax,%r10 + mulx 2*8($aptr),%r9,%rax # a[2]*a[0] + adcx %r10,%r9 + adox %rax,%r11 + .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... + adcx %r11,%r10 + adox %rax,%r12 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax + adcx %r12,%r11 + adox %rax,%r13 + mulx 5*8($aptr),%r12,%rax + adcx %r13,%r12 + adox %rax,%r14 + mulx 6*8($aptr),%r13,%rax + adcx %r14,%r13 + adox %r15,%rax + mulx 7*8($aptr),%r14,%r15 + mov 1*8($aptr),%rdx # a[1] + adcx %rax,%r14 + adox $zero,%r15 + adc 8*8($tptr),%r15 + mov %r8,1*8($tptr) # t[1] + mov %r9,2*8($tptr) # t[2] + sbb $carry,$carry # mov %cf,$carry + xor $zero,$zero # cf=0, of=0 + + + mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] + mulx 3*8($aptr),%r9,%rax # a[3]*a[1] + adcx %r10,%r8 + adox %rbx,%r9 + mulx 4*8($aptr),%r10,%rbx # ... + adcx %r11,%r9 + adox %rax,%r10 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax + adcx %r12,%r10 + adox %rbx,%r11 + .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx + adcx %r13,%r11 + adox %r14,%r12 + .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 + mov 2*8($aptr),%rdx # a[2] + adcx %rax,%r12 + adox %rbx,%r13 + adcx %r15,%r13 + adox $zero,%r14 # of=0 + adcx $zero,%r14 # cf=0 + + mov %r8,3*8($tptr) # t[3] + mov %r9,4*8($tptr) # t[4] + + mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] + mulx 4*8($aptr),%r9,%rax # a[4]*a[2] + adcx %r10,%r8 + adox %rbx,%r9 + mulx 5*8($aptr),%r10,%rbx # ... + adcx %r11,%r9 + adox %rax,%r10 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax + adcx %r12,%r10 + adox %r13,%r11 + .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 + .byte 0x3e + mov 3*8($aptr),%rdx # a[3] + adcx %rbx,%r11 + adox %rax,%r12 + adcx %r14,%r12 + mov %r8,5*8($tptr) # t[5] + mov %r9,6*8($tptr) # t[6] + mulx 4*8($aptr),%r8,%rax # a[4]*a[3] + adox $zero,%r13 # of=0 + adcx $zero,%r13 # cf=0 + + mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] + adcx %r10,%r8 + adox %rax,%r9 + mulx 6*8($aptr),%r10,%rax # ... 
+ adcx %r11,%r9 + adox %r12,%r10 + mulx 7*8($aptr),%r11,%r12 + mov 4*8($aptr),%rdx # a[4] + mov 5*8($aptr),%r14 # a[5] + adcx %rbx,%r10 + adox %rax,%r11 + mov 6*8($aptr),%r15 # a[6] + adcx %r13,%r11 + adox $zero,%r12 # of=0 + adcx $zero,%r12 # cf=0 + + mov %r8,7*8($tptr) # t[7] + mov %r9,8*8($tptr) # t[8] + + mulx %r14,%r9,%rax # a[5]*a[4] + mov 7*8($aptr),%r8 # a[7] + adcx %r10,%r9 + mulx %r15,%r10,%rbx # a[6]*a[4] + adox %rax,%r10 + adcx %r11,%r10 + mulx %r8,%r11,%rax # a[7]*a[4] + mov %r14,%rdx # a[5] + adox %rbx,%r11 + adcx %r12,%r11 + #adox $zero,%rax # of=0 + adcx $zero,%rax # cf=0 + + mulx %r15,%r14,%rbx # a[6]*a[5] + mulx %r8,%r12,%r13 # a[7]*a[5] + mov %r15,%rdx # a[6] + lea 8*8($aptr),$aptr + adcx %r14,%r11 + adox %rbx,%r12 + adcx %rax,%r12 + adox $zero,%r13 + + .byte 0x67,0x67 + mulx %r8,%r8,%r14 # a[7]*a[6] + adcx %r8,%r13 + adcx $zero,%r14 + + cmp 8+8(%rsp),$aptr + je .Lsqrx8x_outer_break + + neg $carry # mov $carry,%cf + mov \$-8,%rcx + mov $zero,%r15 + mov 8*8($tptr),%r8 + adcx 9*8($tptr),%r9 # +=t[9] + adcx 10*8($tptr),%r10 # ... + adcx 11*8($tptr),%r11 + adc 12*8($tptr),%r12 + adc 13*8($tptr),%r13 + adc 14*8($tptr),%r14 + adc 15*8($tptr),%r15 + lea ($aptr),$aaptr + lea 2*64($tptr),$tptr + sbb %rax,%rax # mov %cf,$carry + + mov -64($aptr),%rdx # a[0] + mov %rax,16+8(%rsp) # offload $carry + mov $tptr,24+8(%rsp) + + #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above + xor %eax,%eax # cf=0, of=0 + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + mov %r8,%rbx + mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] + adcx %rax,%rbx # +=t[8] + adox %r9,%r8 + + mulx 1*8($aaptr),%rax,%r9 # ... + adcx %rax,%r8 + adox %r10,%r9 + + mulx 2*8($aaptr),%rax,%r10 + adcx %rax,%r9 + adox %r11,%r10 + + mulx 3*8($aaptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + + mulx 5*8($aaptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 6*8($aaptr),%rax,%r14 + mov %rbx,($tptr,%rcx,8) # store t[8+i] + mov \$0,%ebx + adcx %rax,%r13 + adox %r15,%r14 + + .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 + mov 8($aptr,%rcx,8),%rdx # a[i] + adcx %rax,%r14 + adox %rbx,%r15 # %rbx is 0, of=0 + adcx %rbx,%r15 # cf=0 + + .byte 0x67 + inc %rcx # of=0 + jnz .Lsqrx8x_loop + + lea 8*8($aaptr),$aaptr + mov \$-8,%rcx + cmp 8+8(%rsp),$aaptr # done? 
+ je .Lsqrx8x_break + + sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf + .byte 0x66 + mov -64($aptr),%rdx + adcx 0*8($tptr),%r8 + adcx 1*8($tptr),%r9 + adc 2*8($tptr),%r10 + adc 3*8($tptr),%r11 + adc 4*8($tptr),%r12 + adc 5*8($tptr),%r13 + adc 6*8($tptr),%r14 + adc 7*8($tptr),%r15 + lea 8*8($tptr),$tptr + .byte 0x67 + sbb %rax,%rax # mov %cf,%rax + xor %ebx,%ebx # cf=0, of=0 + mov %rax,16+8(%rsp) # offload carry + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xor $zero,$zero + sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf + adcx $zero,%r8 + mov 24+8(%rsp),$carry # initial $tptr, borrow $carry + adcx $zero,%r9 + mov 0*8($aptr),%rdx # a[8], modulo-scheduled + adc \$0,%r10 + mov %r8,0*8($tptr) + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 + cmp $carry,$tptr # cf=0, of=0 + je .Lsqrx8x_outer_loop + + mov %r9,1*8($tptr) + mov 1*8($carry),%r9 + mov %r10,2*8($tptr) + mov 2*8($carry),%r10 + mov %r11,3*8($tptr) + mov 3*8($carry),%r11 + mov %r12,4*8($tptr) + mov 4*8($carry),%r12 + mov %r13,5*8($tptr) + mov 5*8($carry),%r13 + mov %r14,6*8($tptr) + mov 6*8($carry),%r14 + mov %r15,7*8($tptr) + mov 7*8($carry),%r15 + mov $carry,$tptr + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + mov %r9,9*8($tptr) # t[9] + movq %xmm3,%rcx # -$num + mov %r10,10*8($tptr) # ... + mov %r11,11*8($tptr) + mov %r12,12*8($tptr) + mov %r13,13*8($tptr) + mov %r14,14*8($tptr) +___ +} { +my $i="%rcx"; +$code.=<<___; + lea 48+8(%rsp),$tptr + mov ($aptr,$i),%rdx # a[0] + + mov 8($tptr),$A0[1] # t[1] + xor $A0[0],$A0[0] # t[0], of=0, cf=0 + mov 0+8(%rsp),$num # restore $num + adox $A0[1],$A0[1] + mov 16($tptr),$A1[0] # t[2] # prefetch + mov 24($tptr),$A1[1] # t[3] # prefetch + #jmp .Lsqrx4x_shift_n_add # happens to be aligned + +.align 32 +.Lsqrx4x_shift_n_add: + mulx %rdx,%rax,%rbx + adox $A1[0],$A1[0] + adcx $A0[0],%rax + .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch + .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch + adox $A1[1],$A1[1] + adcx $A0[1],%rbx + mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch + mov %rax,0($tptr) + mov %rbx,8($tptr) + + mulx %rdx,%rax,%rbx + adox $A0[0],$A0[0] + adcx $A1[0],%rax + mov 16($aptr,$i),%rdx # a[i+2] # prefetch + mov 48($tptr),$A1[0] # t[2*i+6] # prefetch + adox $A0[1],$A0[1] + adcx $A1[1],%rbx + mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch + mov %rax,16($tptr) + mov %rbx,24($tptr) + + mulx %rdx,%rax,%rbx + adox $A1[0],$A1[0] + adcx $A0[0],%rax + mov 24($aptr,$i),%rdx # a[i+3] # prefetch + lea 32($i),$i + mov 64($tptr),$A0[0] # t[2*i+8] # prefetch + adox $A1[1],$A1[1] + adcx $A0[1],%rbx + mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch + mov %rax,32($tptr) + mov %rbx,40($tptr) + + mulx %rdx,%rax,%rbx + adox $A0[0],$A0[0] + adcx $A1[0],%rax + jrcxz .Lsqrx4x_shift_n_add_break + .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch + adox $A0[1],$A0[1] + adcx $A1[1],%rbx + mov 80($tptr),$A1[0] # t[2*i+10] # prefetch + mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch + mov %rax,48($tptr) + mov %rbx,56($tptr) + lea 64($tptr),$tptr + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcx $A1[1],%rbx + mov %rax,48($tptr) + mov %rbx,56($tptr) + lea 64($tptr),$tptr # end of t[] buffer +___ +} +###################################################################### +# Montgomery reduction part, "word-by-word" algorithm. 
+# +# This new path is inspired by multiple submissions from Intel, by +# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, +# Vinodh Gopal... +{ +my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); + +$code.=<<___; + movq %xmm2,$nptr +__bn_sqrx8x_reduction: + xor %eax,%eax # initial top-most carry bit + mov 32+8(%rsp),%rbx # n0 + mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) + lea -8*8($nptr,$num),%rcx # end of n[] + #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer + mov %rcx, 0+8(%rsp) # save end of n[] + mov $tptr,8+8(%rsp) # save end of t[] + + lea 48+8(%rsp),$tptr # initial t[] window + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + mov 8*1($tptr),%r9 + mov 8*2($tptr),%r10 + mov 8*3($tptr),%r11 + mov 8*4($tptr),%r12 + mov %rdx,%r8 + imulq %rbx,%rdx # n0*a[i] + mov 8*5($tptr),%r13 + mov 8*6($tptr),%r14 + mov 8*7($tptr),%r15 + mov %rax,24+8(%rsp) # store top-most carry bit + + lea 8*8($tptr),$tptr + xor $carry,$carry # cf=0,of=0 + mov \$-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + mov %r8, %rbx + mulx 8*0($nptr),%rax,%r8 # n[0] + adcx %rbx,%rax # discarded + adox %r9,%r8 + + mulx 8*1($nptr),%rbx,%r9 # n[1] + adcx %rbx,%r8 + adox %r10,%r9 + + mulx 8*2($nptr),%rbx,%r10 + adcx %rbx,%r9 + adox %r11,%r10 + + mulx 8*3($nptr),%rbx,%r11 + adcx %rbx,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 + mov %rdx,%rax + mov %r8,%rdx + adcx %rbx,%r11 + adox %r13,%r12 + + mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded + mov %rax,%rdx + mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] + + mulx 8*5($nptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 8*6($nptr),%rax,%r14 + adcx %rax,%r13 + adox %r15,%r14 + + mulx 8*7($nptr),%rax,%r15 + mov %rbx,%rdx + adcx %rax,%r14 + adox $carry,%r15 # $carry is 0 + adcx $carry,%r15 # cf=0 + + .byte 0x67,0x67,0x67 + inc %rcx # of=0 + jnz .Lsqrx8x_reduce + + mov $carry,%rax # xor %rax,%rax + cmp 0+8(%rsp),$nptr # end of n[]? + jae .Lsqrx8x_no_tail + + mov 48+8(%rsp),%rdx # pull n0*a[0] + add 8*0($tptr),%r8 + lea 8*8($nptr),$nptr + mov \$-8,%rcx + adcx 8*1($tptr),%r9 + adcx 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + lea 8*8($tptr),$tptr + sbb %rax,%rax # top carry + + xor $carry,$carry # of=0, cf=0 + mov %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + mov %r8,%rbx + mulx 8*0($nptr),%rax,%r8 + adcx %rax,%rbx + adox %r9,%r8 + + mulx 8*1($nptr),%rax,%r9 + adcx %rax,%r8 + adox %r10,%r9 + + mulx 8*2($nptr),%rax,%r10 + adcx %rax,%r9 + adox %r11,%r10 + + mulx 8*3($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + + mulx 8*5($nptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 8*6($nptr),%rax,%r14 + adcx %rax,%r13 + adox %r15,%r14 + + mulx 8*7($nptr),%rax,%r15 + mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] + adcx %rax,%r14 + adox $carry,%r15 + mov %rbx,($tptr,%rcx,8) # save result + mov %r8,%rbx + adcx $carry,%r15 # cf=0 + + inc %rcx # of=0 + jnz .Lsqrx8x_tail + + cmp 0+8(%rsp),$nptr # end of n[]? 
+ jae .Lsqrx8x_tail_done # break out of loop + + sub 16+8(%rsp),$carry # mov 16(%rsp),%cf + mov 48+8(%rsp),%rdx # pull n0*a[0] + lea 8*8($nptr),$nptr + adc 8*0($tptr),%r8 + adc 8*1($tptr),%r9 + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + lea 8*8($tptr),$tptr + sbb %rax,%rax + sub \$8,%rcx # mov \$-8,%rcx + + xor $carry,$carry # of=0, cf=0 + mov %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xor %rax,%rax + add 24+8(%rsp),%r8 # can this overflow? + adc \$0,%r9 + adc \$0,%r10 + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 + adc \$0,%rax + + sub 16+8(%rsp),$carry # mov 16(%rsp),%cf +.Lsqrx8x_no_tail: # %cf is 0 if jumped here + adc 8*0($tptr),%r8 + movq %xmm3,%rcx + adc 8*1($tptr),%r9 + mov 8*7($nptr),$carry + movq %xmm2,$nptr # restore $nptr + adc 8*2($tptr),%r10 + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + adc \$0,%rax # top-most carry + + mov 32+8(%rsp),%rbx # n0 + mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" + + mov %r8,8*0($tptr) # store top 512 bits + lea 8*8($tptr),%r8 # borrow %r8 + mov %r9,8*1($tptr) + mov %r10,8*2($tptr) + mov %r11,8*3($tptr) + mov %r12,8*4($tptr) + mov %r13,8*5($tptr) + mov %r14,8*6($tptr) + mov %r15,8*7($tptr) + + lea 8*8($tptr,%rcx),$tptr # start of current t[] window + cmp 8+8(%rsp),%r8 # end of t[]? + jb .Lsqrx8x_reduction_loop + ret +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +___ +} +############################################################## +# Post-condition, 4x unrolled +# +{ +my ($rptr,$nptr)=("%rdx","%rbp"); +$code.=<<___; +.align 32 +.type __bn_postx4x_internal,\@abi-omnipotent +__bn_postx4x_internal: +.cfi_startproc + mov 8*0($nptr),%r12 + mov %rcx,%r10 # -$num + mov %rcx,%r9 # -$num + neg %rax + sar \$3+2,%rcx + #lea 48+8(%rsp,%r9),$tptr + movq %xmm1,$rptr # restore $rptr + movq %xmm1,$aptr # prepare for back-to-back call + dec %r12 # so that after 'not' we get -n[0] + mov 8*1($nptr),%r13 + xor %r8,%r8 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqrx4x_sub_entry: + andn %rax,%r12,%r12 + lea 8*4($nptr),$nptr + andn %rax,%r13,%r13 + andn %rax,%r14,%r14 + andn %rax,%r15,%r15 + + neg %r8 # mov %r8,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 + mov %r12,8*0($rptr) + lea 8*4($tptr),$tptr + mov %r13,8*1($rptr) + sbb %r8,%r8 # mov %cf,%r8 + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + + inc %rcx + jnz .Lsqrx4x_sub + + neg %r9 # restore $num + + ret +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal +___ +} +}}} +{ +my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order + ("%rdi","%esi","%rdx","%ecx"); # Unix order +my $out=$inp; +my $STRIDE=2**5*8; +my $N=$STRIDE/4; + +$code.=<<___; +.globl bn_scatter5 +.type bn_scatter5,\@abi-omnipotent +.align 16 +bn_scatter5: +.cfi_startproc + _CET_ENDBR + cmp \$0, $num + jz .Lscatter_epilogue + + # $tbl stores 32 entries, t0 through t31. Each entry has $num words. + # They are interleaved in memory as follows: + # + # t0[0] t1[0] t2[0] ... t31[0] + # t0[1] t1[1] t2[1] ... t31[1] + # ... + # t0[$num-1] t1[$num-1] t2[$num-1] ... 
t31[$num-1] + + lea ($tbl,$idx,8),$tbl +.Lscatter: + mov ($inp),%rax + lea 8($inp),$inp + mov %rax,($tbl) + lea 32*8($tbl),$tbl + sub \$1,$num + jnz .Lscatter +.Lscatter_epilogue: + ret +.cfi_endproc +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.type bn_gather5,\@abi-omnipotent +.align 32 +bn_gather5: +.cfi_startproc +.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases + _CET_ENDBR + # I can't trust assembler to use specific encoding:-( + .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 +.cfi_def_cfa_register %r10 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp + lea .Linc(%rip),%rax + and \$-16,%rsp # shouldn't be formally required + + movd $idx,%xmm5 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 128($tbl),%r11 # size optimization + lea 128(%rsp),%rax # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast $idx + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# Calculate masks by comparing 0..31 to $idx and save result to stack. +# +# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored +# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and +# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations +# are scheduled in groups of four. +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 +___ +$code.=<<___ if ($i); + movdqa %xmm3,`16*($i-1)-128`(%rax) +___ +$code.=<<___; + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)-128`(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)-128`(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)-128`(%rax) + movdqa %xmm4,%xmm2 +___ +} +$code.=<<___; + movdqa %xmm3,`16*($i-1)-128`(%rax) + jmp .Lgather + +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +# Combine the masks with the corresponding table entries to select the correct +# entry. +$code.=<<___; + movdqa `16*($i+0)-128`(%r11),%xmm0 + movdqa `16*($i+1)-128`(%r11),%xmm1 + movdqa `16*($i+2)-128`(%r11),%xmm2 + pand `16*($i+0)-128`(%rax),%xmm0 + movdqa `16*($i+3)-128`(%r11),%xmm3 + pand `16*($i+1)-128`(%rax),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rax),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + lea $STRIDE(%r11),%r11 + # Combine the upper and lower halves of %xmm0. + pshufd \$0x4e,%xmm4,%xmm0 # Swap upper and lower halves. 
+ por %xmm4,%xmm0
+ movq %xmm0,($out) # m0=bp[0]
+ lea 8($out),$out
+ sub \$1,$num
+ jnz .Lgather
+
+ lea (%r10),%rsp
+.cfi_def_cfa_register %rsp
+ ret
+.LSEH_end_bn_gather5:
+.cfi_endproc
+.size bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.section .rodata
+.align 64
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by "
+.text
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # beginning of body label
+ cmp %r10,%rbx # context->Rip<body label
+ jb .Lcommon_pop_regs
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea .Lmul4x_epilogue(%rip),%r10 # *ring*: hacked for deletion of _nohw
+ cmp %r10,%rbx
+ ja .Lbody_40
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
+ jmp .Lcommon_pop_regs
+
+.Lbody_40:
+ mov 40(%rax),%rax # pull saved stack pointer
+.Lcommon_pop_regs:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mul_handler,.-mul_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
+ .rva .LSEH_end_bn_mul4x_mont_gather5
+ .rva .LSEH_info_bn_mul4x_mont_gather5
+
+ .rva .LSEH_begin_bn_power5_nohw
+ .rva .LSEH_end_bn_power5_nohw
+ .rva .LSEH_info_bn_power5_nohw
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_bn_mulx4x_mont_gather5
+ .rva .LSEH_end_bn_mulx4x_mont_gather5
+ .rva .LSEH_info_bn_mulx4x_mont_gather5
+
+ .rva .LSEH_begin_bn_powerx5
+ .rva .LSEH_end_bn_powerx5
+ .rva .LSEH_info_bn_powerx5
+___
+$code.=<<___;
+ .rva 
.LSEH_begin_bn_gather5
+ .rva .LSEH_end_bn_gather5
+ .rva .LSEH_info_bn_gather5
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_power5_nohw:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
+___
+$code.=<<___ if ($addx);
+.align 8
+.LSEH_info_bn_mulx4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_powerx5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
+___
+$code.=<<___;
+.align 8
+.LSEH_info_bn_gather5:
+ .byte 0x01,0x0b,0x03,0x0a
+ .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
+ .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
+.align 8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/ring-0.17.14/crypto/fipsmodule/bn/internal.h b/ring-0.17.14/crypto/fipsmodule/bn/internal.h
new file mode 100644
index 0000000000..f59b5af080
--- /dev/null
+++ b/ring-0.17.14/crypto/fipsmodule/bn/internal.h
@@ -0,0 +1,154 @@
+// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef OPENSSL_HEADER_BN_INTERNAL_H
+#define OPENSSL_HEADER_BN_INTERNAL_H
+
+#include 
+
+#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push, 3)
+#include <intrin.h>
+#pragma warning(pop)
+#pragma intrinsic(_umul128)
+#endif
+
+#include "../../internal.h"
+
+typedef crypto_word_t BN_ULONG;
+
+#if defined(OPENSSL_64_BIT)
+
+#if defined(BORINGSSL_HAS_UINT128)
+// MSVC doesn't support two-word integers on 64-bit.
+#define BN_ULLONG uint128_t
+#endif
+
+#define BN_BITS2 64
+#define BN_MONT_CTX_N0_LIMBS 1
+#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo), 0
+#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
+
+#elif defined(OPENSSL_32_BIT)
+
+#define BN_ULLONG uint64_t
+#define BN_BITS2 32
+// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
+// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
+// needs to be two words long. Only certain 32-bit platforms actually make use
+// of n0[1] and a shorter R value would suffice for the others. However,
+// currently only the assembly files know which is which.
+#define BN_MONT_CTX_N0_LIMBS 2
+#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo)
+#define TOBN(hi, lo) (lo), (hi)
+
+#else
+#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
+#endif
+
+
+
+// BN_MONTGOMERY_MAX_WORDS is the maximum number of words allowed in a |BIGNUM|
+// used with Montgomery reduction. Ideally this limit would be applied to all
+// |BIGNUM|s, in |bn_wexpand|, but the exactfloat library needs to create 8 MiB
+// values for other operations. 
+// #define BN_MONTGOMERY_MAX_WORDS (8 * 1024 / sizeof(BN_ULONG)) + +// bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words +// long. Inputs and outputs are in Montgomery form. |n0| is a pointer to +// an |N0|. +// +// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced. +// If neither is fully-reduced, the output may not be either. +// +// This function allocates |num| words on the stack, so |num| should be at most +// |BN_MONTGOMERY_MAX_WORDS|. +// +// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks +// off upper bits. The aarch64 implementation expects a 64-bit input and does +// not. |size_t| is the safer option but not strictly correct for x86_64. But +// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot. +// +// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word +// inputs. +// +// |num| must be at least 4, at least on x86. +// +// In other forks, |bn_mul_mont| returns an |int| indicating whether it +// actually did the multiplication. All our implementations always do the +// multiplication, and forcing callers to deal with the possibility of it +// failing just leads to further problems. +OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) || + (sizeof(int) == 4 && sizeof(size_t) == 8), + "int and size_t ABI mismatch"); +#if defined(OPENSSL_X86_64) +void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +static inline void bn_mul_mont_small( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num) { + bn_mul_mont_nohw(rp, ap, bp, np, n0, num); +} +#elif defined(OPENSSL_AARCH64) +void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +static inline void bn_mul_mont_small( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num) { + // No point in optimizing for P-256 because P-256 doesn't call into + // this on AArch64. + bn_mul_mont_nohw(rp, ap, bp, np, n0, num); +} +#elif defined(OPENSSL_ARM) +void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +static inline void bn_mul_mont_small( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num) { + // Approximate what `bn_mul_mont` did so that the NEON version for P-256 + // when practical. + if (num == 8) { + // XXX: This should not be accessing `neon_available` directly. 
+ if (neon_available) { + bn_mul8x_mont_neon(rp, ap, bp, np, n0, num); + return; + } + } + bn_mul_mont_nohw(rp, ap, bp, np, n0, num); +} +#else +void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num); +static inline void bn_mul_mont_small( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, size_t num) { + bn_mul_mont(rp, ap, bp, np, n0, num); +} +#endif + +static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out, + BN_ULONG a, BN_ULONG b) { +#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__) + *low_out = _umul128(a, b, high_out); +#else + BN_ULLONG result = (BN_ULLONG)a * b; + *low_out = (BN_ULONG)result; + *high_out = (BN_ULONG)(result >> BN_BITS2); +#endif +} + +#endif // OPENSSL_HEADER_BN_INTERNAL_H diff --git a/ring-0.17.14/crypto/fipsmodule/bn/montgomery.c b/ring-0.17.14/crypto/fipsmodule/bn/montgomery.c new file mode 100644 index 0000000000..07e757d32e --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/montgomery.c @@ -0,0 +1,64 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal.h" +#include "../../internal.h" + +#include "../../limbs/limbs.h" +#include "../../limbs/limbs.inl" + +OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, + "BN_MONT_CTX_N0_LIMBS value is invalid"); +OPENSSL_STATIC_ASSERT( + sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), + "uint64_t is insufficient precision for n0"); + +int bn_from_montgomery_in_place(BN_ULONG r[], size_t num_r, BN_ULONG a[], + size_t num_a, const BN_ULONG n[], + size_t num_n, + const BN_ULONG n0_[BN_MONT_CTX_N0_LIMBS]) { + if (num_n == 0 || num_r != num_n || num_a != 2 * num_n) { + return 0; + } + + // Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On + // input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r| + // includes |carry| which is stored separately. + BN_ULONG n0 = n0_[0]; + BN_ULONG carry = 0; + for (size_t i = 0; i < num_n; i++) { + BN_ULONG v = limbs_mul_add_limb(a + i, n, a[i] * n0, num_n); + v += carry + a[i + num_n]; + carry |= (v != a[i + num_n]); + carry &= (v <= a[i + num_n]); + a[i + num_n] = v; + } + + // Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a| + // includes |carry| which is stored separately. + a += num_n; + + // |a| thus requires at most one additional subtraction |n| to be reduced. + // Subtract |n| and select the answer in constant time. + BN_ULONG v = limbs_sub(r, a, n, num_n) - carry; + // |v| is one if |a| - |n| underflowed or zero if it did not. Note |v| cannot + // be -1. That would imply the subtraction did not fit in |num_n| words, and + // we know at most one subtraction is needed. 
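+  // Stretch the 0/1 value in |v| into an all-zeros or all-ones mask so the
+  // constant-time select below can consume it.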
+ v = 0u - v; + for (size_t i = 0; i < num_n; i++) { + r[i] = constant_time_select_w(v, a[i], r[i]); + a[i] = 0; + } + return 1; +} diff --git a/ring-0.17.14/crypto/fipsmodule/bn/montgomery_inv.c b/ring-0.17.14/crypto/fipsmodule/bn/montgomery_inv.c new file mode 100644 index 0000000000..070cdc1c35 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/bn/montgomery_inv.c @@ -0,0 +1,105 @@ +/* Copyright 2016 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "internal.h" +#include "../../internal.h" + + +OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, + "BN_MONT_CTX_N0_LIMBS value is invalid"); +OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), + "uint64_t is insufficient precision for n0"); + +// LG_LITTLE_R is log_2(r). +#define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2) + +// bn_neg_inv_r_mod_n_u64 calculates the -1/n mod r; i.e. it calculates |v| +// such that u*r - v*n == 1. |r| is the constant defined in |bn_mont_n0|. |n| +// must be odd. +// +// This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery +// Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf). +// It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and +// Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000" +// (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21). +// +// This is inspired by Joppe W. Bos's "Constant Time Modular Inversion" +// (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is +// constant-time with respect to |n|. We assume uint64_t additions, +// subtractions, shifts, and bitwise operations are all constant time, which +// may be a large leap of faith on 32-bit targets. We avoid division and +// multiplication, which tend to be the most problematic in terms of timing +// leaks. +// +// Most GCD implementations return values such that |u*r + v*n == 1|, so the +// caller would have to negate the resultant |v| for the purpose of Montgomery +// multiplication. This implementation does the negation implicitly by doing +// the computations as a difference instead of a sum. +uint64_t bn_neg_inv_mod_r_u64(uint64_t n) { + dev_assert_secret(n % 2 == 1); + + // alpha == 2**(lg r - 1) == r / 2. + static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1); + + const uint64_t beta = n; + + uint64_t u = 1; + uint64_t v = 0; + + // The invariant maintained from here on is: + // 2**(lg r - i) == u*2*alpha - v*beta. + for (size_t i = 0; i < LG_LITTLE_R; ++i) { +#if BN_BITS2 == 64 && defined(BN_ULLONG) + dev_assert_secret((BN_ULLONG)(1) << (LG_LITTLE_R - i) == + ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); +#endif + + // Delete a common factor of 2 in u and v if |u| is even. 
Otherwise, set + // |u = (u + beta) / 2| and |v = (v / 2) + alpha|. + + uint64_t u_is_odd = UINT64_C(0) - (u & 1); // Either 0xff..ff or 0. + + // The addition can overflow, so use Dietz's method for it. + // + // Dietz calculates (x+y)/2 by (x xor y)>>1 + x&y. This is valid for all + // (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values + // (embedded in 64 bits to so that overflow can be ignored): + // + // (declare-fun x () (_ BitVec 64)) + // (declare-fun y () (_ BitVec 64)) + // (assert (let ( + // (one (_ bv1 64)) + // (thirtyTwo (_ bv32 64))) + // (and + // (bvult x (bvshl one thirtyTwo)) + // (bvult y (bvshl one thirtyTwo)) + // (not (= + // (bvadd (bvlshr (bvxor x y) one) (bvand x y)) + // (bvlshr (bvadd x y) one))) + // ))) + // (check-sat) + uint64_t beta_if_u_is_odd = beta & u_is_odd; // Either |beta| or 0. + u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd); + + uint64_t alpha_if_u_is_odd = alpha & u_is_odd; /* Either |alpha| or 0. */ + v = (v >> 1) + alpha_if_u_is_odd; + } + + // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha. +#if BN_BITS2 == 64 && defined(BN_ULLONG) + declassify_assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); +#endif + + return v; +} diff --git a/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl b/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl new file mode 100644 index 0000000000..720b3b489c --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl @@ -0,0 +1,1567 @@ +#! /usr/bin/env perl +# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# ECP_NISTZ256 module for ARMv8. +# +# February 2015. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. +# +# with/without -DECP_NISTZ256_ASM +# Apple A7 +190-360% +# Cortex-A53 +190-400% +# Cortex-A57 +190-350% +# Denver +230-400% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +400% means 5x improvement. + +# The first two arguments should always be the flavour and output file path. +if ($#ARGV < 1) { die "Not enough arguments provided. 
+ Two arguments are necessary: the flavour and the output file path."; } + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +{ +my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, + $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = + map("x$_",(0..17,19,20)); + +my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont + +$code.=<<___; +.section .rodata +.align 5 +.Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +.LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +.Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +.Lone: +.quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by " +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov $bp,$ap + mov $acc0,xzr // a = 0 + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to $a0-$a3 and b[0] - to $bi +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $t3,$a3,$bi + ldr $bi,[$bp,#8] // b[1] + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adc $acc4,xzr,$t3 + mov $acc5,xzr +___ +for($i=1;$i<4;$i++) { + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + mul $t0,$a0,$bi // lo(a[0]*b[i]) + adcs $acc1,$acc2,$t1 + mul $t1,$a1,$bi // lo(a[1]*b[i]) + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + mul $t2,$a2,$bi // lo(a[2]*b[i]) + adcs $acc3,$acc4,$t3 + mul $t3,$a3,$bi // lo(a[3]*b[i]) + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts of multiplication + umulh $t0,$a0,$bi // hi(a[0]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi // hi(a[1]*b[i]) + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi // hi(a[2]*b[i]) + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi // hi(a[3]*b[i]) + adc $acc4,$acc4,xzr +___ +$code.=<<___ if ($i<3); + ldr $bi,[$bp,#8*($i+1)] // b[$i+1] +___ +$code.=<<___; + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + // last reduction + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adcs $acc3,$acc4,$t3 + adc $acc4,$acc5,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to $a0-$a3 +.type __ecp_nistz256_sqr_mont,%function +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + lsl $t0,$acc0,#32 + adcs $acc6,$acc6,$t3 + lsr $t1,$acc0,#32 + adc $acc7,$acc7,$a3 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary in + # multiplication for details +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + lsl $t0,$acc0,#32 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + lsr $t1,$acc0,#32 + adc $acc3,$t3,xzr // can't overflow +___ +} +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adc $acc3,$t3,xzr // can't overflow + + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// $a0-$a3 and $t0-$t3. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
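+// The reduction below is branch-free: after the add, a trial subtraction of
+// the modulus is computed and csel picks the reduced or unreduced result
+// based on the final borrow.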
+.type __ecp_nistz256_add_to,%function +.align 4 +__ecp_nistz256_add_to: + adds $acc0,$acc0,$t0 // ret = a+b + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + adc $ap,xzr,xzr // zap $ap + + adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to + +.type __ecp_nistz256_sub_from,%function +.align 4 +__ecp_nistz256_sub_from: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$acc0,$t0 // ret = a-b + sbcs $acc1,$acc1,$t1 + sbcs $acc2,$acc2,$t2 + sbcs $acc3,$acc3,$t3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 4 +__ecp_nistz256_sub_morf: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$t0,$acc0 // ret = b-a + sbcs $acc1,$t1,$acc1 + sbcs $acc2,$t2,$acc2 + sbcs $acc3,$t3,$acc3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adcs $t3,$acc3,$poly3 + adc $ap,xzr,xzr // zap $ap + tst $acc0,#1 // is a even? + + csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + csel $acc3,$acc3,$t3,eq + csel $ap,xzr,$ap,eq + + lsr $acc0,$acc0,#1 // ret >>= 1 + orr $acc0,$acc0,$acc1,lsl#63 + lsr $acc1,$acc1,#1 + orr $acc1,$acc1,$acc2,lsl#63 + lsr $acc2,$acc2,#1 + orr $acc2,$acc2,$acc3,lsl#63 + lsr $acc3,$acc3,#1 + stp $acc0,$acc1,[$rp] + orr $acc3,$acc3,$ap,lsl#63 + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real) = map("x$_",(21,22)); + +$code.=<<___; +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +.Ldouble_shortcut: + ldp $acc0,$acc1,[$ap,#32] + mov $rp_real,$rp + ldp $acc2,$acc3,[$ap,#48] + mov $ap_real,$ap + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + mov $t0,$acc0 + ldr $poly3,[$poly3,#24] + mov $t1,$acc1 + ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[$ap_real,#64+16] + add $rp,sp,#$S + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp $t0,$t1,[$ap_real] + ldp $t2,$t3,[$ap_real,#16] + mov $a0,$acc0 // put Zsqr aside for p256_sub + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add $bp,$ap_real,#0 + mov $acc0,$a0 // restore Zsqr + mov $acc1,$a1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $acc2,$a2 + mov $acc3,$a3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add $rp,sp,#$S + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[$ap_real,#64] + ldp $a2,$a3,[$ap_real,#64+16] + add $bp,$ap_real,#32 + add $rp,sp,#$tmp0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,$rp_real,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add $rp,sp,#$tmp0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$M] + ldp $a2,$a3,[sp,#$M+16] + add $rp,$rp_real,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add $bp,sp,#$Zsqr + add $rp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov $t0,$acc0 // duplicate M + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + mov $a0,$acc0 // put M aside + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add_to + mov $t0,$a0 // restore M + mov $t1,$a1 + ldr $bi,[$ap_real] // forward load for p256_mul_mont + mov $t2,$a2 + ldp $a0,$a1,[sp,#$S] + mov $t3,$a3 + ldp $a2,$a3,[sp,#$S+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add $bp,$ap_real,#0 + add $rp,sp,#$S + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$M+16] + add $rp,sp,#$tmp0 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add $rp,$rp_real,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add $bp,sp,#$tmp0 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add $bp,sp,#$S + add $rp,sp,#$S + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr $bi,[sp,#$M] + mov $a0,$acc0 // copy S + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $bp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add $bp,$rp_real,#32 + add $rp,$rp_real,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void 
ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); + +$code.=<<___; +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp $a0,$a1,[$bp,#64] // in2_z + ldp $a2,$a3,[$bp,#64+16] + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in2infty,$t0,$t2 + cmp $in2infty,#0 + csetm $in2infty,ne // ~in2infty + add $rp,sp,#$Z2sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp $a0,$a1,[$ap_real,#64] // in1_z + ldp $a2,$a3,[$ap_real,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // ~in1infty + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$Z2sqr] + ldp $a2,$a3,[sp,#$Z2sqr+16] + add $bp,$bp_real,#64 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $bp,$ap_real,#32 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,sp,#$S1 + ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont + ldp $a0,$a1,[$ap_real] + ldp $a2,$a3,[$ap_real,#16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) + + add $bp,sp,#$Z2sqr + add $rp,sp,#$U1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr $bi,[sp,#$Z1sqr] + ldp $a0,$a1,[$bp_real] + ldp $a2,$a3,[$bp_real,#16] + add $bp,sp,#$Z1sqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add $bp,sp,#$U1 + ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2) + + mvn $temp1,$in1infty // -1/0 -> 0/-1 + mvn $temp2,$in2infty // -1/0 -> 0/-1 + orr $acc0,$acc0,$temp1 + orr $acc0,$acc0,$temp2 + orr $acc0,$acc0,$temp0 + cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +.Ladd_double: + mov $ap,$ap_real + mov $rp,$rp_real + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$res_z] + ldp $a2,$a3,[sp,#$res_z+16] + add $bp,$bp_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[sp,#$Hsqr] + ldp $a0,$a1,[sp,#$U1] + ldp $a2,$a3,[sp,#$U1+16] + add $bp,sp,#$Hsqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + add $rp,sp,#$Hsqr + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add $bp,sp,#$Rsqr + add $rp,sp,#$res_x + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add $bp,sp,#$Hcub + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add $bp,sp,#$U2 + ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $rp,sp,#$res_y + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add $bp,sp,#$Hcub + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr $bi,[sp,#$R] + ldp $a0,$a1,[sp,#$res_y] + ldp $a2,$a3,[sp,#$res_y+16] + add $bp,sp,#$R + add $rp,sp,#$res_y + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add $bp,sp,#$S2 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp $a0,$a1,[sp,#$res_x] // res + ldp $a2,$a3,[sp,#$res_x+16] + ldp $t0,$t1,[$bp_real] // in2 + ldp $t2,$t3,[$bp_real,#16] +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + ldp $a0,$a1,[sp,#$res_x+$i+32] // res + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + ldp $a2,$a3,[sp,#$res_x+$i+48] + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + ldp $t0,$t1,[$bp_real,#$i+32] // in2 + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + ldp $t2,$t3,[$bp_real,#$i+48] + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] +___ +} +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? 
+ csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + +.Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + adrp $poly3,:pg_hi21:.Lpoly + add $poly3,$poly3,:lo12:.Lpoly + ldr $poly1,[$poly3,#8] + ldr $poly3,[$poly3,#24] + + ldp $a0,$a1,[$ap,#64] // in1_z + ldp $a2,$a3,[$ap,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // ~in1infty + + ldp $acc0,$acc1,[$bp] // in2_x + ldp $acc2,$acc3,[$bp,#16] + ldp $t0,$t1,[$bp,#32] // in2_y + ldp $t2,$t3,[$bp,#48] + orr $acc0,$acc0,$acc1 + orr $acc2,$acc2,$acc3 + orr $t0,$t0,$t1 + orr $t2,$t2,$t3 + orr $acc0,$acc0,$acc2 + orr $t0,$t0,$t2 + orr $in2infty,$acc0,$t0 + cmp $in2infty,#0 + csetm $in2infty,ne // ~in2infty + + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov $a0,$acc0 + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + ldr $bi,[$bp_real] + add $bp,$bp_real,#0 + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add $bp,$ap_real,#0 + ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,$ap_real,#32 + ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp $a0,$a1,[sp,#$R] + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[$ap_real] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,$ap_real,#0 + add 
$rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + add $rp,sp,#$Hsqr + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add $bp,sp,#$Rsqr + add $rp,sp,#$res_x + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add $bp,sp,#$Hcub + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add $bp,sp,#$U2 + ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Hcub] + ldp $a2,$a3,[sp,#$Hcub+16] + add $rp,sp,#$res_y + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add $bp,$ap_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr $bi,[sp,#$R] + ldp $a0,$a1,[sp,#$res_y] + ldp $a2,$a3,[sp,#$res_y+16] + add $bp,sp,#$R + add $rp,sp,#$res_y + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add $bp,sp,#$S2 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp $a0,$a1,[sp,#$res_x] // res + ldp $a2,$a3,[sp,#$res_x+16] + ldp $t0,$t1,[$bp_real] // in2 + ldp $t2,$t3,[$bp_real,#16] +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + ldp $a0,$a1,[sp,#$res_x+$i+32] // res + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + ldp $a2,$a3,[sp,#$res_x+$i+48] + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + ldp $t0,$t1,[$bp_real,#$i+32] // in2 + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + ldp $t2,$t3,[$bp_real,#$i+48] + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] +___ +$code.=<<___ if ($i == 0); + adrp $bp_real,:pg_hi21:.Lone_mont-64 + add $bp_real,$bp_real,:lo12:.Lone_mont-64 +___ +} +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} +if (1) { +my ($ord0,$ord1) = ($poly1,$poly3); +my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); +my $acc7 = $bi; + +$code.=<<___; +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,%function +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp $ordk,:pg_hi21:.Lord + add $ordk,$ordk,:lo12:.Lord + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $acc4,$a3,$bi + + mul $t4,$acc0,$ordk + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adc $acc4,$acc4,xzr + mov $acc5,xzr +___ +for ($i=1;$i<4;$i++) { + ################################################################ + # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 + # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh +$code.=<<___; + ldr $bi,[$bp,#8*$i] // b[i] + + lsl $t0,$t4,#32 + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + mul $t0,$a0,$bi + adc $t3,$t3,xzr + mul $t1,$a1,$bi + + adds $acc0,$acc1,$t2 + mul $t2,$a2,$bi + adcs $acc1,$acc2,$t3 + mul $t3,$a3,$bi + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts + umulh $t0,$a0,$bi + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,$acc4,xzr + mul $t4,$acc0,$ordk + adds $acc1,$acc1,$t0 // accumulate high parts + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + lsl $t0,$t4,#32 // last reduction + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
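+ // Note: res = a^(2^rep) in the Montgomery domain. The loop below performs
+ // `rep` consecutive Montgomery squarings modulo ord(P-256), feeding each
+ // result straight back in as the next input.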
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp $ordk,:pg_hi21:.Lord + add $ordk,$ordk,:lo12:.Lord + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub $bp,$bp,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + mul $t4,$acc0,$ordk + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + adcs $acc6,$acc6,$t3 + adc $acc7,$acc7,$a3 +___ +for($i=0; $i<4; $i++) { # reductions +$code.=<<___; + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adc $acc3,xzr,$t4 // can't overflow +___ +$code.=<<___ if ($i<3); + mul $t3,$acc0,$ordk +___ +$code.=<<___; + lsl $t0,$t4,#32 + subs $acc1,$acc1,$t4 + lsr $t1,$t4,#32 + sbcs $acc2,$acc2,$t0 + sbc $acc3,$acc3,$t1 // can't borrow +___ + ($t3,$t4) = ($t4,$t3); +} +$code.=<<___; + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $a0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus + csel $a1,$acc1,$t1,lo + csel $a2,$acc2,$t2,lo + csel $a3,$acc3,$t3,lo + + cbnz $bp,.Loop_ord_sqr + + stp $a0,$a1,[$rp] + stp $a2,$a3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ +} } + +######################################################################## +# select subroutines +# These select functions are similar to those in p256-x86_64-asm.pl +# They load all points in the lookup table +# keeping in the output only the one corresponding to the input index. +{ +my ($val,$in_t)=map("x$_",(0..1)); +my ($index)=("w2"); +my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11"); +my ($Mask)=("v3"); +my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21)); +my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27)); +$code.=<<___; +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,%function +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // $Val_in := $val + // $Idx_ctr := 0; loop counter and incremented internal index + mov $Val_in, $val + mov $Idx_ctr, #0 + + // [$Ra-$Rf] := 0 + movi $Ra.16b, #0 + movi $Rb.16b, #0 + movi $Rc.16b, #0 + movi $Rd.16b, #0 + movi $Re.16b, #0 + movi $Rf.16b, #0 + +.Lselect_w5_loop: + // Loop 16 times. + + // Increment index (loop counter); tested at the end of the loop + add $Idx_ctr, $Idx_ctr, #1 + + // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t + // and advance $in_t to point to the next entry + ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64 + + // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s + cmp $Idx_ctr, $index + csetm $Mask_64, eq + + // continue loading ... + ld1 {$T0e.2d, $T0f.2d}, [$in_t],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup $Mask.2d, $Mask_64 + + // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd] + // i.e., values in output registers will remain the same if $Idx_ctr != $index + bit $Ra.16b, $T0a.16b, $Mask.16b + bit $Rb.16b, $T0b.16b, $Mask.16b + + bit $Rc.16b, $T0c.16b, $Mask.16b + bit $Rd.16b, $T0d.16b, $Mask.16b + + bit $Re.16b, $T0e.16b, $Mask.16b + bit $Rf.16b, $T0f.16b, $Mask.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz $Idx_ctr, #4, .Lselect_w5_loop + + // Write [$Ra-$Rf] to memory at the output pointer + st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64 + st1 {$Re.2d, $Rf.2d}, [$Val_in] + + ret +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,%function +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // $Idx_ctr := 0; loop counter and incremented internal index + mov $Idx_ctr, #0 + + // [$Ra-$Rf] := 0 + movi $Ra.16b, #0 + movi $Rb.16b, #0 + movi $Rc.16b, #0 + movi $Rd.16b, #0 + +.Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add $Idx_ctr, $Idx_ctr, #1 + + // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t + // and advance $in_t to point to the next entry + ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64 + + // $Mask_64 := ($Idx_ctr == $index)? 
All 1s : All 0s + cmp $Idx_ctr, $index + csetm $Mask_64, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup $Mask.2d, $Mask_64 + + // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd] + // i.e., values in output registers will remain the same if $Idx_ctr != $index + bit $Ra.16b, $T0a.16b, $Mask.16b + bit $Rb.16b, $T0b.16b, $Mask.16b + + bit $Rc.16b, $T0c.16b, $Mask.16b + bit $Rd.16b, $T0d.16b, $Mask.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz $Idx_ctr, #6, .Lselect_w7_loop + + // Write [$Ra-$Rd] to memory at the output pointer + st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val] + + ret +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl new file mode 100644 index 0000000000..87d9852695 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl @@ -0,0 +1,4144 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2014, Intel Corporation. All Rights Reserved. +# Copyright (c) 2015 CloudFlare, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) +# (1) Intel Corporation, Israel Development Center, Haifa, Israel +# (2) University of Haifa, Israel +# (3) CloudFlare, Inc. +# +# Reference: +# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +# 256 Bit Primes" + +# Further optimization by : +# +# this/original with/without -DECP_NISTZ256_ASM(*) +# Opteron +15-49% +150-195% +# Bulldozer +18-45% +175-240% +# P4 +24-46% +100-150% +# Westmere +18-34% +87-160% +# Sandy Bridge +14-35% +120-185% +# Ivy Bridge +11-35% +125-180% +# Haswell +10-37% +160-200% +# Broadwell +24-58% +210-270% +# Atom +20-50% +180-240% +# VIA Nano +50-160% +480-480% +# +# (*) "without -DECP_NISTZ256_ASM" refers to build with +# "enable-ec_nistp_64_gcc_128"; +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. In "this/original" column lower coefficient is for +# ECDSA sign, while in "with/without" - for ECDH key agreement, and +# higher - for ECDSA sign, relatively fastest server-side operation. +# Keep in mind that +100% means 2x improvement. 
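+#
+# Note on the build plumbing below: like the armv8 script earlier in this
+# patch, the generated code is piped through a perlasm translator
+# (x86_64-xlate.pl, located via $xlate) which emits the assembler dialect
+# selected by $flavour; nasm/masm/mingw64 flavours also set $win64 so the
+# Windows-specific XMM save/restore sequences are included.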
+ +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$avx = 2; +$addx = 1; + +$code.=<<___; +.text + +# The polynomial +.section .rodata +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +# Constants for computations modulo ord(p256) +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.text +___ + +{ +my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); +my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); +my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); + +$code.=<<___; + +################################################################################ +# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,\@function,2 +.align 32 +ecp_nistz256_neg: +.cfi_startproc + _CET_ENDBR + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Lneg_body: + + xor $a0, $a0 + xor $a1, $a1 + xor $a2, $a2 + xor $a3, $a3 + xor $t4, $t4 + + sub 8*0($a_ptr), $a0 + sbb 8*1($a_ptr), $a1 + sbb 8*2($a_ptr), $a2 + mov $a0, $t0 + sbb 8*3($a_ptr), $a3 + lea .Lpoly(%rip), $a_ptr + mov $a1, $t1 + sbb \$0, $t4 + + add 8*0($a_ptr), $a0 + mov $a2, $t2 + adc 8*1($a_ptr), $a1 + adc 8*2($a_ptr), $a2 + mov $a3, $t3 + adc 8*3($a_ptr), $a3 + test $t4, $t4 + + cmovz $t0, $a0 + cmovz $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovz $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovz $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_neg,.-ecp_nistz256_neg +___ +} +{ +my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); +my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); +my ($poly1,$poly3)=($acc6,$acc7); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_ord_mul_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t b[4]); + +.globl ecp_nistz256_ord_mul_mont_nohw +.type ecp_nistz256_ord_mul_mont_nohw,\@function,3 +.align 32 +ecp_nistz256_ord_mul_mont_nohw: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_mul_body: + + mov 8*0($b_org), %rax + mov $b_org, $b_ptr + lea .Lord(%rip), %r14 + mov .LordK(%rip), %r15 + + ################################# * b[0] + mov %rax, $t0 + mulq 8*0($a_ptr) + mov %rax, $acc0 + mov $t0, %rax + mov %rdx, $acc1 + + mulq 8*1($a_ptr) + add %rax, $acc1 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $acc2 + + mulq 8*2($a_ptr) + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + + mov $acc0, $acc5 + imulq %r15,$acc0 + + mov %rdx, $acc3 + mulq 
8*3($a_ptr) + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# First reduction step + mulq 8*0(%r14) + mov $acc0, $t1 + add %rax, $acc5 # guaranteed to be zero + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $t0 + + sub $acc0, $acc2 + sbb \$0, $acc0 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc1 + adc \$0, %rdx + add %rax, $acc1 + mov $t1, %rax + adc %rdx, $acc2 + mov $t1, %rdx + adc \$0, $acc0 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc3 + mov 8*1($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc0, $acc3 + adc $t1, $acc4 + adc \$0, $acc5 + + ################################# * b[1] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc1 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + + mov $acc1, $t0 + imulq %r15, $acc1 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + xor $acc0, $acc0 + add %rax, $acc4 + mov $acc1, %rax + adc %rdx, $acc5 + adc \$0, $acc0 + + ################################# Second reduction step + mulq 8*0(%r14) + mov $acc1, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc1, %rax + adc %rdx, $t0 + + sub $acc1, $acc3 + sbb \$0, $acc1 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t1, %rax + adc %rdx, $acc3 + mov $t1, %rdx + adc \$0, $acc1 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc4 + mov 8*2($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc1, $acc4 + adc $t1, $acc5 + adc \$0, $acc0 + + ################################## * b[2] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t0, %rax + adc \$0, %rdx + + mov $acc2, $t0 + imulq %r15, $acc2 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc5 + adc \$0, %rdx + xor $acc1, $acc1 + add %rax, $acc5 + mov $acc2, %rax + adc %rdx, $acc0 + adc \$0, $acc1 + + ################################# Third reduction step + mulq 8*0(%r14) + mov $acc2, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc2, %rax + adc %rdx, $t0 + + sub $acc2, $acc4 + sbb \$0, $acc2 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc %rdx, $acc4 + mov $t1, %rdx + adc \$0, $acc2 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc5 + mov 8*3($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc2, $acc5 + adc $t1, $acc0 + adc \$0, $acc1 + + ################################# * b[3] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $t0, %rax + adc \$0, %rdx + + mov $acc3, $t0 + imulq %r15, $acc3 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc0 + adc \$0, %rdx + xor $acc2, $acc2 + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + adc \$0, $acc2 + + ################################# Last reduction step + mulq 8*0(%r14) + mov $acc3, $t1 + add %rax, $t0 # guaranteed to be zero 
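+ # ($acc3 was set to $t0*.LordK by the imulq above; .LordK*ord[0] == -1
+ #  mod 2^64, so $t0 plus the low 64 bits of $acc3*ord[0] wraps to exactly
+ #  zero and only the carry propagates into the next limb)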
+ mov $acc3, %rax + adc %rdx, $t0 + + sub $acc3, $acc5 + sbb \$0, $acc3 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc %rdx, $acc5 + mov $t1, %rdx + adc \$0, $acc3 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc0 + sbb %rdx, $t1 # can't borrow + + add $acc3, $acc0 + adc $t1, $acc1 + adc \$0, $acc2 + + ################################# Subtract ord + mov $acc4, $a_ptr + sub 8*0(%r14), $acc4 + mov $acc5, $acc3 + sbb 8*1(%r14), $acc5 + mov $acc0, $t0 + sbb 8*2(%r14), $acc0 + mov $acc1, $t1 + sbb 8*3(%r14), $acc1 + sbb \$0, $acc2 + + cmovc $a_ptr, $acc4 + cmovc $acc3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw + +################################################################################ +# void ecp_nistz256_ord_sqr_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t rep); + +.globl ecp_nistz256_ord_sqr_mont_nohw +.type ecp_nistz256_ord_sqr_mont_nohw,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_mont_nohw: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_sqr_body: + + mov 8*0($a_ptr), $acc0 + mov 8*1($a_ptr), %rax + mov 8*2($a_ptr), $acc6 + mov 8*3($a_ptr), $acc7 + lea .Lord(%rip), $a_ptr # pointer to modulus + mov $b_org, $b_ptr + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + ################################# a[1:] * a[0] + mov %rax, $t1 # put aside a[1] + mul $acc0 # a[1] * a[0] + mov %rax, $acc1 + movq $t1, %xmm1 # offload a[1] + mov $acc6, %rax + mov %rdx, $acc2 + + mul $acc0 # a[2] * a[0] + add %rax, $acc2 + mov $acc7, %rax + movq $acc6, %xmm2 # offload a[2] + adc \$0, %rdx + mov %rdx, $acc3 + + mul $acc0 # a[3] * a[0] + add %rax, $acc3 + mov $acc7, %rax + movq $acc7, %xmm3 # offload a[3] + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# a[3] * a[2] + mul $acc6 # a[3] * a[2] + mov %rax, $acc5 + mov $acc6, %rax + mov %rdx, $acc6 + + ################################# a[2:] * a[1] + mul $t1 # a[2] * a[1] + add %rax, $acc3 + mov $acc7, %rax + adc \$0, %rdx + mov %rdx, $acc7 + + mul $t1 # a[3] * a[1] + add %rax, $acc4 + adc \$0, %rdx + + add $acc7, $acc4 + adc %rdx, $acc5 + adc \$0, $acc6 # can't overflow + + ################################# *2 + xor $acc7, $acc7 + mov $acc0, %rax + add $acc1, $acc1 + adc $acc2, $acc2 + adc $acc3, $acc3 + adc $acc4, $acc4 + adc $acc5, $acc5 + adc $acc6, $acc6 + adc \$0, $acc7 + + ################################# Missing products + mul %rax # a[0] * a[0] + mov %rax, $acc0 + movq %xmm1, %rax + mov %rdx, $t1 + + mul %rax # a[1] * a[1] + add $t1, $acc1 + adc %rax, $acc2 + movq %xmm2, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mul %rax # a[2] * a[2] + add $t1, $acc3 + adc %rax, $acc4 + movq %xmm3, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mov $acc0, $t0 + imulq 8*4($a_ptr), $acc0 # *= .LordK + + mul %rax # a[3] * a[3] + add $t1, $acc5 + adc %rax, $acc6 + mov 8*0($a_ptr), %rax # modulus[0] + adc %rdx, $acc7 # 
can't overflow + + ################################# First reduction step + mul $acc0 + mov $acc0, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax # modulus[1] + adc %rdx, $t0 + + sub $acc0, $acc2 + sbb \$0, $t1 # can't borrow + + mul $acc0 + add $t0, $acc1 + adc \$0, %rdx + add %rax, $acc1 + mov $acc0, %rax + adc %rdx, $acc2 + mov $acc0, %rdx + adc \$0, $t1 # can't overflow + + mov $acc1, $t0 + imulq 8*4($a_ptr), $acc1 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc3 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc0 # can't borrow + + add $t1, $acc3 + adc \$0, $acc0 # can't overflow + + ################################# Second reduction step + mul $acc1 + mov $acc1, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc1, $acc3 + sbb \$0, $t1 # can't borrow + + mul $acc1 + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $acc1, %rax + adc %rdx, $acc3 + mov $acc1, %rdx + adc \$0, $t1 # can't overflow + + mov $acc2, $t0 + imulq 8*4($a_ptr), $acc2 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc0 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc1 # can't borrow + + add $t1, $acc0 + adc \$0, $acc1 # can't overflow + + ################################# Third reduction step + mul $acc2 + mov $acc2, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc2, $acc0 + sbb \$0, $t1 # can't borrow + + mul $acc2 + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $acc2, %rax + adc %rdx, $acc0 + mov $acc2, %rdx + adc \$0, $t1 # can't overflow + + mov $acc3, $t0 + imulq 8*4($a_ptr), $acc3 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc1 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc2 # can't borrow + + add $t1, $acc1 + adc \$0, $acc2 # can't overflow + + ################################# Last reduction step + mul $acc3 + mov $acc3, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc3, $acc1 + sbb \$0, $t1 # can't borrow + + mul $acc3 + add $t0, $acc0 + adc \$0, %rdx + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + mov $acc3, %rdx + adc \$0, $t1 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc2 + sbb %rdx, $acc3 # can't borrow + + add $t1, $acc2 + adc \$0, $acc3 # can't overflow + + ################################# Add bits [511:256] of the sqr result + xor %rdx, %rdx + add $acc4, $acc0 + adc $acc5, $acc1 + mov $acc0, $acc4 + adc $acc6, $acc2 + adc $acc7, $acc3 + mov $acc1, %rax + adc \$0, %rdx + + ################################# Compare to modulus + sub 8*0($a_ptr), $acc0 + mov $acc2, $acc6 + sbb 8*1($a_ptr), $acc1 + sbb 8*2($a_ptr), $acc2 + mov $acc3, $acc7 + sbb 8*3($a_ptr), $acc3 + sbb \$0, %rdx + + cmovc $acc4, $acc0 + cmovnc $acc1, %rax + cmovnc $acc2, $acc6 + cmovnc $acc3, $acc7 + + dec $b_ptr + jnz .Loop_ord_sqr + + mov $acc0, 8*0($r_ptr) + mov %rax, 8*1($r_ptr) + pxor %xmm1, %xmm1 + mov $acc6, 8*2($r_ptr) + pxor %xmm2, %xmm2 + mov $acc7, 8*3($r_ptr) + pxor %xmm3, %xmm3 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw +___ + +$code.=<<___ if ($addx); +################################################################################ +.globl 
ecp_nistz256_ord_mul_mont_adx +.type ecp_nistz256_ord_mul_mont_adx,\@function,3 +.align 32 +ecp_nistz256_ord_mul_mont_adx: +.cfi_startproc +.Lecp_nistz256_ord_mul_mont_adx: + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_mulx_body: + + mov $b_org, $b_ptr + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + lea -128($a_ptr), $a_ptr # control u-op density + lea .Lord-128(%rip), %r14 + mov .LordK(%rip), %r15 + + ################################# Multiply by b[0] + mulx $acc1, $acc0, $acc1 + mulx $acc2, $t0, $acc2 + mulx $acc3, $t1, $acc3 + add $t0, $acc1 + mulx $acc4, $t0, $acc4 + mov $acc0, %rdx + mulx %r15, %rdx, %rax + adc $t1, $acc2 + adc $t0, $acc3 + adc \$0, $acc4 + + ################################# reduction + xor $acc5, $acc5 # $acc5=0, cf=0, of=0 + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc0 # guaranteed to be zero + adox $t1, $acc1 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*1($b_ptr), %rdx + adcx $t0, $acc3 + adox $t1, $acc4 + adcx $acc0, $acc4 + adox $acc0, $acc5 + adc \$0, $acc5 # cf=0, of=0 + + ################################# Multiply by b[1] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc1, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc4 + adox $t1, $acc5 + + adcx $acc0, $acc5 + adox $acc0, $acc0 + adc \$0, $acc0 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc1 # guaranteed to be zero + adox $t1, $acc2 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*2($b_ptr), %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adcx $acc1, $acc5 + adox $acc1, $acc0 + adc \$0, $acc0 # cf=0, of=0 + + ################################# Multiply by b[2] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc2, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc5 + adox $t1, $acc0 + + adcx $acc1, $acc0 + adox $acc1, $acc1 + adc \$0, $acc1 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc2 # guaranteed to be zero + adox $t1, $acc3 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*3($b_ptr), %rdx + adcx $t0, $acc5 + adox $t1, $acc0 + adcx $acc2, $acc0 + adox $acc2, $acc1 + adc \$0, $acc1 # cf=0, of=0 + + ################################# Multiply by b[3] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc3, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc0 + adox $t1, $acc1 + + adcx $acc2, $acc1 + 
adox $acc2, $acc2 + adc \$0, $acc2 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc3 # guranteed to be zero + adox $t1, $acc4 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128(%r14), $t0, $t1 + lea 128(%r14),%r14 + mov $acc4, $t2 + adcx $t0, $acc0 + adox $t1, $acc1 + mov $acc5, $t3 + adcx $acc3, $acc1 + adox $acc3, $acc2 + adc \$0, $acc2 + + ################################# + # Branch-less conditional subtraction of P + mov $acc0, $t0 + sub 8*0(%r14), $acc4 + sbb 8*1(%r14), $acc5 + sbb 8*2(%r14), $acc0 + mov $acc1, $t1 + sbb 8*3(%r14), $acc1 + sbb \$0, $acc2 + + cmovc $t2, $acc4 + cmovc $t3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx + +.globl ecp_nistz256_ord_sqr_mont_adx +.type ecp_nistz256_ord_sqr_mont_adx,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_mont_adx: +.cfi_startproc + _CET_ENDBR +.Lecp_nistz256_ord_sqr_mont_adx: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_sqrx_body: + + mov $b_org, $b_ptr + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + lea .Lord(%rip), $a_ptr + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulx $acc6, $acc1, $acc2 # a[0]*a[1] + mulx $acc7, $t0, $acc3 # a[0]*a[2] + mov %rdx, %rax # offload a[0] + movq $acc6, %xmm1 # offload a[1] + mulx $acc0, $t1, $acc4 # a[0]*a[3] + mov $acc6, %rdx + add $t0, $acc2 + movq $acc7, %xmm2 # offload a[2] + adc $t1, $acc3 + adc \$0, $acc4 + xor $acc5, $acc5 # $acc5=0,cf=0,of=0 + ################################# + mulx $acc7, $t0, $t1 # a[1]*a[2] + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx $acc0, $t0, $t1 # a[1]*a[3] + mov $acc7, %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adc \$0, $acc5 + ################################# + mulx $acc0, $t0, $acc6 # a[2]*a[3] + mov %rax, %rdx + movq $acc0, %xmm3 # offload a[3] + xor $acc7, $acc7 # $acc7=0,cf=0,of=0 + adcx $acc1, $acc1 # acc1:6<<1 + adox $t0, $acc5 + adcx $acc2, $acc2 + adox $acc7, $acc6 # of=0 + + ################################# a[i]*a[i] + mulx %rdx, $acc0, $t1 + movq %xmm1, %rdx + adcx $acc3, $acc3 + adox $t1, $acc1 + adcx $acc4, $acc4 + mulx %rdx, $t0, $t4 + movq %xmm2, %rdx + adcx $acc5, $acc5 + adox $t0, $acc2 + adcx $acc6, $acc6 + mulx %rdx, $t0, $t1 + .byte 0x67 + movq %xmm3, %rdx + adox $t4, $acc3 + adcx $acc7, $acc7 + adox $t0, $acc4 + adox $t1, $acc5 + mulx %rdx, $t0, $t4 + adox $t0, $acc6 + adox $t4, $acc7 + + ################################# reduction + mov $acc0, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + xor %rax, %rax # cf=0, of=0 + mulx 8*0($a_ptr), $t0, $t1 + adcx $t0, $acc0 # guaranteed to be zero + adox $t1, $acc1 + mulx 8*1($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*2($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*3($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 # of=0 + adcx %rax, 
$acc0 # cf=0 + + ################################# + mov $acc1, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adox $t0, $acc1 # guaranteed to be zero + adcx $t1, $acc2 + mulx 8*1($a_ptr), $t0, $t1 + adox $t0, $acc2 + adcx $t1, $acc3 + mulx 8*2($a_ptr), $t0, $t1 + adox $t0, $acc3 + adcx $t1, $acc0 + mulx 8*3($a_ptr), $t0, $t1 + adox $t0, $acc0 + adcx $t1, $acc1 # cf=0 + adox %rax, $acc1 # of=0 + + ################################# + mov $acc2, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adcx $t0, $acc2 # guaranteed to be zero + adox $t1, $acc3 + mulx 8*1($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 + mulx 8*2($a_ptr), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + mulx 8*3($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 # of=0 + adcx %rax, $acc2 # cf=0 + + ################################# + mov $acc3, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adox $t0, $acc3 # guaranteed to be zero + adcx $t1, $acc0 + mulx 8*1($a_ptr), $t0, $t1 + adox $t0, $acc0 + adcx $t1, $acc1 + mulx 8*2($a_ptr), $t0, $t1 + adox $t0, $acc1 + adcx $t1, $acc2 + mulx 8*3($a_ptr), $t0, $t1 + adox $t0, $acc2 + adcx $t1, $acc3 + adox %rax, $acc3 + + ################################# accumulate upper half + add $acc0, $acc4 # add $acc4, $acc0 + adc $acc5, $acc1 + mov $acc4, %rdx + adc $acc6, $acc2 + adc $acc7, $acc3 + mov $acc1, $acc6 + adc \$0, %rax + + ################################# compare to modulus + sub 8*0($a_ptr), $acc4 + mov $acc2, $acc7 + sbb 8*1($a_ptr), $acc1 + sbb 8*2($a_ptr), $acc2 + mov $acc3, $acc0 + sbb 8*3($a_ptr), $acc3 + sbb \$0, %rax + + cmovnc $acc4, %rdx + cmovnc $acc1, $acc6 + cmovnc $acc2, $acc7 + cmovnc $acc3, $acc0 + + dec $b_ptr + jnz .Loop_ord_sqrx + + mov %rdx, 8*0($r_ptr) + mov $acc6, 8*1($r_ptr) + pxor %xmm1, %xmm1 + mov $acc7, 8*2($r_ptr) + pxor %xmm2, %xmm2 + mov $acc0, 8*3($r_ptr) + pxor %xmm3, %xmm3 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx +___ + +$code.=<<___; +################################################################################ +# void ecp_nistz256_mul_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t b[4]); + +.globl ecp_nistz256_mul_mont_nohw +.type ecp_nistz256_mul_mont_nohw,\@function,3 +.align 32 +ecp_nistz256_mul_mont_nohw: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmul_body: + mov $b_org, $b_ptr + mov 8*0($b_org), %rax + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + + call __ecp_nistz256_mul_montq + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw + +.type __ecp_nistz256_mul_montq,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_montq: +.cfi_startproc + 
######################################################################## + # Multiply a by b[0] + mov %rax, $t1 + mulq $acc1 + mov .Lpoly+8*1(%rip),$poly1 + mov %rax, $acc0 + mov $t1, %rax + mov %rdx, $acc1 + + mulq $acc2 + mov .Lpoly+8*3(%rip),$poly3 + add %rax, $acc1 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $acc2 + + mulq $acc3 + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $acc3 + + mulq $acc4 + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + xor $acc5, $acc5 + mov %rdx, $acc4 + + ######################################################################## + # First reduction step + # Basically now we want to multiply acc[0] by p256, + # and add the result to the acc. + # Due to the special form of p256 we do some optimizations + # + # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] + # then we add acc[0] and get acc[0] x 2^96 + + mov $acc0, $t1 + shl \$32, $acc0 + mulq $poly3 + shr \$32, $t1 + add $acc0, $acc1 # +=acc[0]<<96 + adc $t1, $acc2 + adc %rax, $acc3 + mov 8*1($b_ptr), %rax + adc %rdx, $acc4 + adc \$0, $acc5 + xor $acc0, $acc0 + + ######################################################################## + # Multiply by b[1] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc1 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $acc1, %rax + adc %rdx, $acc5 + adc \$0, $acc0 + + ######################################################################## + # Second reduction step + mov $acc1, $t1 + shl \$32, $acc1 + mulq $poly3 + shr \$32, $t1 + add $acc1, $acc2 + adc $t1, $acc3 + adc %rax, $acc4 + mov 8*2($b_ptr), %rax + adc %rdx, $acc5 + adc \$0, $acc0 + xor $acc1, $acc1 + + ######################################################################## + # Multiply by b[2] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $acc2, %rax + adc %rdx, $acc0 + adc \$0, $acc1 + + ######################################################################## + # Third reduction step + mov $acc2, $t1 + shl \$32, $acc2 + mulq $poly3 + shr \$32, $t1 + add $acc2, $acc3 + adc $t1, $acc4 + adc %rax, $acc5 + mov 8*3($b_ptr), %rax + adc %rdx, $acc0 + adc \$0, $acc1 + xor $acc2, $acc2 + + ######################################################################## + # Multiply by b[3] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc0 + adc \$0, %rdx + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + adc \$0, $acc2 + + ######################################################################## + # Final reduction step + mov $acc3, $t1 + shl \$32, $acc3 + mulq $poly3 + shr \$32, $t1 + add $acc3, $acc4 + adc 
$t1, $acc5 + mov $acc4, $t0 + adc %rax, $acc0 + adc %rdx, $acc1 + mov $acc5, $t1 + adc \$0, $acc2 + + ######################################################################## + # Branch-less conditional subtraction of P + sub \$-1, $acc4 # .Lpoly[0] + mov $acc0, $t2 + sbb $poly1, $acc5 # .Lpoly[1] + sbb \$0, $acc0 # .Lpoly[2] + mov $acc1, $t3 + sbb $poly3, $acc1 # .Lpoly[3] + sbb \$0, $acc2 + + cmovc $t0, $acc4 + cmovc $t1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $t2, $acc0 + mov $acc5, 8*1($r_ptr) + cmovc $t3, $acc1 + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + +################################################################################ +# void ecp_nistz256_sqr_mont( +# uint64_t res[4], +# uint64_t a[4]); + +# we optimize the square according to S.Gueron and V.Krasnov, +# "Speeding up Big-Number Squaring" +.globl ecp_nistz256_sqr_mont_nohw +.type ecp_nistz256_sqr_mont_nohw,\@function,2 +.align 32 +ecp_nistz256_sqr_mont_nohw: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lsqr_body: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + + call __ecp_nistz256_sqr_montq + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw + +.type __ecp_nistz256_sqr_montq,\@abi-omnipotent +.align 32 +__ecp_nistz256_sqr_montq: +.cfi_startproc + mov %rax, $acc5 + mulq $acc6 # a[1]*a[0] + mov %rax, $acc1 + mov $acc7, %rax + mov %rdx, $acc2 + + mulq $acc5 # a[0]*a[2] + add %rax, $acc2 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $acc3 + + mulq $acc5 # a[0]*a[3] + add %rax, $acc3 + mov $acc7, %rax + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# + mulq $acc6 # a[1]*a[2] + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq $acc6 # a[1]*a[3] + add %rax, $acc4 + mov $acc0, %rax + adc \$0, %rdx + add $t1, $acc4 + mov %rdx, $acc5 + adc \$0, $acc5 + + ################################# + mulq $acc7 # a[2]*a[3] + xor $acc7, $acc7 + add %rax, $acc5 + mov 8*0($a_ptr), %rax + mov %rdx, $acc6 + adc \$0, $acc6 + + add $acc1, $acc1 # acc1:6<<1 + adc $acc2, $acc2 + adc $acc3, $acc3 + adc $acc4, $acc4 + adc $acc5, $acc5 + adc $acc6, $acc6 + adc \$0, $acc7 + + mulq %rax + mov %rax, $acc0 + mov 8*1($a_ptr), %rax + mov %rdx, $t0 + + mulq %rax + add $t0, $acc1 + adc %rax, $acc2 + mov 8*2($a_ptr), %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq %rax + add $t0, $acc3 + adc %rax, $acc4 + mov 8*3($a_ptr), %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq %rax + add $t0, $acc5 + adc %rax, $acc6 + mov $acc0, %rax + adc %rdx, $acc7 + + mov .Lpoly+8*1(%rip), $a_ptr + mov .Lpoly+8*3(%rip), $t1 + + ########################################## + # Now the reduction + # First iteration + mov $acc0, $t0 + shl \$32, $acc0 + mulq $t1 + shr \$32, $t0 + add $acc0, $acc1 # +=acc[0]<<96 + adc $t0, $acc2 + adc %rax, $acc3 + mov $acc1, %rax + adc \$0, %rdx + + ########################################## + # Second iteration + mov $acc1, $t0 + shl \$32, $acc1 + mov %rdx, $acc0 + mulq $t1 + shr \$32, $t0 + add 
$acc1, $acc2 + adc $t0, $acc3 + adc %rax, $acc0 + mov $acc2, %rax + adc \$0, %rdx + + ########################################## + # Third iteration + mov $acc2, $t0 + shl \$32, $acc2 + mov %rdx, $acc1 + mulq $t1 + shr \$32, $t0 + add $acc2, $acc3 + adc $t0, $acc0 + adc %rax, $acc1 + mov $acc3, %rax + adc \$0, %rdx + + ########################################### + # Last iteration + mov $acc3, $t0 + shl \$32, $acc3 + mov %rdx, $acc2 + mulq $t1 + shr \$32, $t0 + add $acc3, $acc0 + adc $t0, $acc1 + adc %rax, $acc2 + adc \$0, %rdx + xor $acc3, $acc3 + + ############################################ + # Add the rest of the acc + add $acc0, $acc4 + adc $acc1, $acc5 + mov $acc4, $acc0 + adc $acc2, $acc6 + adc %rdx, $acc7 + mov $acc5, $acc1 + adc \$0, $acc3 + + sub \$-1, $acc4 # .Lpoly[0] + mov $acc6, $acc2 + sbb $a_ptr, $acc5 # .Lpoly[1] + sbb \$0, $acc6 # .Lpoly[2] + mov $acc7, $t0 + sbb $t1, $acc7 # .Lpoly[3] + sbb \$0, $acc3 + + cmovc $acc0, $acc4 + cmovc $acc1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $acc2, $acc6 + mov $acc5, 8*1($r_ptr) + cmovc $t0, $acc7 + mov $acc6, 8*2($r_ptr) + mov $acc7, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +___ + +if ($addx) { +$code.=<<___; +.globl ecp_nistz256_mul_mont_adx +.type ecp_nistz256_mul_mont_adx,\@function,3 +.align 32 +ecp_nistz256_mul_mont_adx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmulx_body: + mov $b_org, $b_ptr + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + lea -128($a_ptr), $a_ptr # control u-op density + + call __ecp_nistz256_mul_montx + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx + +.type __ecp_nistz256_mul_montx,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + ######################################################################## + # Multiply by b[0] + mulx $acc1, $acc0, $acc1 + mulx $acc2, $t0, $acc2 + mov \$32, $poly1 + xor $acc5, $acc5 # cf=0 + mulx $acc3, $t1, $acc3 + mov .Lpoly+8*3(%rip), $poly3 + adc $t0, $acc1 + mulx $acc4, $t0, $acc4 + mov $acc0, %rdx + adc $t1, $acc2 + shlx $poly1,$acc0,$t1 + adc $t0, $acc3 + shrx $poly1,$acc0,$t0 + adc \$0, $acc4 + + ######################################################################## + # First reduction step + add $t1, $acc1 + adc $t0, $acc2 + + mulx $poly3, $t0, $t1 + mov 8*1($b_ptr), %rdx + adc $t0, $acc3 + adc $t1, $acc4 + adc \$0, $acc5 + xor $acc0, $acc0 # $acc0=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[1] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc1, %rdx + adcx $t0, $acc4 + shlx $poly1, $acc1, $t0 + adox $t1, $acc5 + shrx $poly1, $acc1, $t1 + + adcx $acc0, $acc5 + adox $acc0, $acc0 + adc \$0, $acc0 + + ######################################################################## + # 
Second reduction step + add $t0, $acc2 + adc $t1, $acc3 + + mulx $poly3, $t0, $t1 + mov 8*2($b_ptr), %rdx + adc $t0, $acc4 + adc $t1, $acc5 + adc \$0, $acc0 + xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[2] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc2, %rdx + adcx $t0, $acc5 + shlx $poly1, $acc2, $t0 + adox $t1, $acc0 + shrx $poly1, $acc2, $t1 + + adcx $acc1, $acc0 + adox $acc1, $acc1 + adc \$0, $acc1 + + ######################################################################## + # Third reduction step + add $t0, $acc3 + adc $t1, $acc4 + + mulx $poly3, $t0, $t1 + mov 8*3($b_ptr), %rdx + adc $t0, $acc5 + adc $t1, $acc0 + adc \$0, $acc1 + xor $acc2, $acc2 # $acc2=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[3] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc3, %rdx + adcx $t0, $acc0 + shlx $poly1, $acc3, $t0 + adox $t1, $acc1 + shrx $poly1, $acc3, $t1 + + adcx $acc2, $acc1 + adox $acc2, $acc2 + adc \$0, $acc2 + + ######################################################################## + # Fourth reduction step + add $t0, $acc4 + adc $t1, $acc5 + + mulx $poly3, $t0, $t1 + mov $acc4, $t2 + mov .Lpoly+8*1(%rip), $poly1 + adc $t0, $acc0 + mov $acc5, $t3 + adc $t1, $acc1 + adc \$0, $acc2 + + ######################################################################## + # Branch-less conditional subtraction of P + xor %eax, %eax + mov $acc0, $t0 + sbb \$-1, $acc4 # .Lpoly[0] + sbb $poly1, $acc5 # .Lpoly[1] + sbb \$0, $acc0 # .Lpoly[2] + mov $acc1, $t1 + sbb $poly3, $acc1 # .Lpoly[3] + sbb \$0, $acc2 + + cmovc $t2, $acc4 + cmovc $t3, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $t0, $acc0 + mov $acc5, 8*1($r_ptr) + cmovc $t1, $acc1 + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.globl ecp_nistz256_sqr_mont_adx +.type ecp_nistz256_sqr_mont_adx,\@function,2 +.align 32 +ecp_nistz256_sqr_mont_adx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lsqrx_body: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + lea -128($a_ptr), $a_ptr # control u-op density + + call __ecp_nistz256_sqr_montx + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx + +.type __ecp_nistz256_sqr_montx,\@abi-omnipotent +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulx $acc6, $acc1, $acc2 # a[0]*a[1] + mulx $acc7, $t0, $acc3 # a[0]*a[2] + xor %eax, %eax + adc $t0, $acc2 + mulx $acc0, $t1, $acc4 # a[0]*a[3] + mov $acc6, %rdx + adc $t1, $acc3 + adc \$0, $acc4 + xor $acc5, 
$acc5 # $acc5=0,cf=0,of=0 + + ################################# + mulx $acc7, $t0, $t1 # a[1]*a[2] + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx $acc0, $t0, $t1 # a[1]*a[3] + mov $acc7, %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adc \$0, $acc5 + + ################################# + mulx $acc0, $t0, $acc6 # a[2]*a[3] + mov 8*0+128($a_ptr), %rdx + xor $acc7, $acc7 # $acc7=0,cf=0,of=0 + adcx $acc1, $acc1 # acc1:6<<1 + adox $t0, $acc5 + adcx $acc2, $acc2 + adox $acc7, $acc6 # of=0 + + mulx %rdx, $acc0, $t1 + mov 8*1+128($a_ptr), %rdx + adcx $acc3, $acc3 + adox $t1, $acc1 + adcx $acc4, $acc4 + mulx %rdx, $t0, $t4 + mov 8*2+128($a_ptr), %rdx + adcx $acc5, $acc5 + adox $t0, $acc2 + adcx $acc6, $acc6 + .byte 0x67 + mulx %rdx, $t0, $t1 + mov 8*3+128($a_ptr), %rdx + adox $t4, $acc3 + adcx $acc7, $acc7 + adox $t0, $acc4 + mov \$32, $a_ptr + adox $t1, $acc5 + .byte 0x67,0x67 + mulx %rdx, $t0, $t4 + mov .Lpoly+8*3(%rip), %rdx + adox $t0, $acc6 + shlx $a_ptr, $acc0, $t0 + adox $t4, $acc7 + shrx $a_ptr, $acc0, $t4 + mov %rdx,$t1 + + # reduction step 1 + add $t0, $acc1 + adc $t4, $acc2 + + mulx $acc0, $t0, $acc0 + adc $t0, $acc3 + shlx $a_ptr, $acc1, $t0 + adc \$0, $acc0 + shrx $a_ptr, $acc1, $t4 + + # reduction step 2 + add $t0, $acc2 + adc $t4, $acc3 + + mulx $acc1, $t0, $acc1 + adc $t0, $acc0 + shlx $a_ptr, $acc2, $t0 + adc \$0, $acc1 + shrx $a_ptr, $acc2, $t4 + + # reduction step 3 + add $t0, $acc3 + adc $t4, $acc0 + + mulx $acc2, $t0, $acc2 + adc $t0, $acc1 + shlx $a_ptr, $acc3, $t0 + adc \$0, $acc2 + shrx $a_ptr, $acc3, $t4 + + # reduction step 4 + add $t0, $acc0 + adc $t4, $acc1 + + mulx $acc3, $t0, $acc3 + adc $t0, $acc2 + adc \$0, $acc3 + + xor $t3, $t3 + add $acc0, $acc4 # accumulate upper half + mov .Lpoly+8*1(%rip), $a_ptr + adc $acc1, $acc5 + mov $acc4, $acc0 + adc $acc2, $acc6 + adc $acc3, $acc7 + mov $acc5, $acc1 + adc \$0, $t3 + + sub \$-1, $acc4 # .Lpoly[0] + mov $acc6, $acc2 + sbb $a_ptr, $acc5 # .Lpoly[1] + sbb \$0, $acc6 # .Lpoly[2] + mov $acc7, $acc3 + sbb $t1, $acc7 # .Lpoly[3] + sbb \$0, $t3 + + cmovc $acc0, $acc4 + cmovc $acc1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $acc2, $acc6 + mov $acc5, 8*1($r_ptr) + cmovc $acc3, $acc7 + mov $acc6, 8*2($r_ptr) + mov $acc7, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx +___ +} +} +{ +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); +my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); +my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_select_w5_nohw(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5_nohw +.type ecp_nistz256_select_w5_nohw,\@abi-omnipotent +.align 32 +ecp_nistz256_select_w5_nohw: +.cfi_startproc + _CET_ENDBR +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_select_w5_nohw: + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) + .byte 
0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + movdqa .LOne(%rip), $ONE + movd $index, $INDEX + + pxor $Ra, $Ra + pxor $Rb, $Rb + pxor $Rc, $Rc + pxor $Rd, $Rd + pxor $Re, $Re + pxor $Rf, $Rf + + movdqa $ONE, $M0 + pshufd \$0, $INDEX, $INDEX + + mov \$16, %rax +.Lselect_loop_sse_w5: + + movdqa $M0, $TMP0 + paddd $ONE, $M0 + pcmpeqd $INDEX, $TMP0 + + movdqa 16*0($in_t), $T0a + movdqa 16*1($in_t), $T0b + movdqa 16*2($in_t), $T0c + movdqa 16*3($in_t), $T0d + movdqa 16*4($in_t), $T0e + movdqa 16*5($in_t), $T0f + lea 16*6($in_t), $in_t + + pand $TMP0, $T0a + pand $TMP0, $T0b + por $T0a, $Ra + pand $TMP0, $T0c + por $T0b, $Rb + pand $TMP0, $T0d + por $T0c, $Rc + pand $TMP0, $T0e + por $T0d, $Rd + pand $TMP0, $T0f + por $T0e, $Re + por $T0f, $Rf + + dec %rax + jnz .Lselect_loop_sse_w5 + + movdqu $Ra, 16*0($val) + movdqu $Rb, 16*1($val) + movdqu $Rc, 16*2($val) + movdqu $Rd, 16*3($val) + movdqu $Re, 16*4($val) + movdqu $Rf, 16*5($val) +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea 0xa8(%rsp), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_nohw: +.size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw + +################################################################################ +# void ecp_nistz256_select_w7_nohw(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7_nohw +.type ecp_nistz256_select_w7_nohw,\@abi-omnipotent +.align 32 +ecp_nistz256_select_w7_nohw: +.cfi_startproc + _CET_ENDBR +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_select_w7_nohw: + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + movdqa .LOne(%rip), $M0 + movd $index, $INDEX + + pxor $Ra, $Ra + pxor $Rb, $Rb + pxor $Rc, $Rc + pxor $Rd, $Rd + + movdqa $M0, $ONE + pshufd \$0, $INDEX, $INDEX + mov \$64, %rax + +.Lselect_loop_sse_w7: + movdqa $M0, $TMP0 + paddd $ONE, $M0 + movdqa 16*0($in_t), $T0a + movdqa 16*1($in_t), $T0b + pcmpeqd $INDEX, $TMP0 + movdqa 16*2($in_t), $T0c + movdqa 16*3($in_t), $T0d + lea 16*4($in_t), $in_t + + pand $TMP0, $T0a + pand $TMP0, $T0b + por $T0a, $Ra + pand $TMP0, $T0c + por $T0b, $Rb + pand $TMP0, $T0d + por $T0c, $Rc + prefetcht0 255($in_t) + por $T0d, $Rd + + dec %rax + jnz .Lselect_loop_sse_w7 + + movdqu $Ra, 16*0($val) + movdqu $Rb, 16*1($val) + movdqu $Rc, 16*2($val) + movdqu $Rd, 16*3($val) +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 
+ movaps 0x90(%rsp), %xmm15 + lea 0xa8(%rsp), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_nohw: +.size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw +___ +} +if ($avx>1) { +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); +my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); +my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_select_w5_avx2(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5_avx2 +.type ecp_nistz256_select_w5_avx2,\@abi-omnipotent +.align 32 +ecp_nistz256_select_w5_avx2: +.cfi_startproc + _CET_ENDBR + vzeroupper +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax + mov %rsp,%r11 +.LSEH_begin_ecp_nistz256_select_w5_avx2: + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + vmovdqa .LTwo(%rip), $TWO + + vpxor $Ra, $Ra, $Ra + vpxor $Rb, $Rb, $Rb + vpxor $Rc, $Rc, $Rc + + vmovdqa .LOne(%rip), $M0 + vmovdqa .LTwo(%rip), $M1 + + vmovd $index, %xmm1 + vpermd $INDEX, $Ra, $INDEX + + mov \$8, %rax +.Lselect_loop_avx2_w5: + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + vmovdqa 32*2($in_t), $T0c + + vmovdqa 32*3($in_t), $T1a + vmovdqa 32*4($in_t), $T1b + vmovdqa 32*5($in_t), $T1c + + vpcmpeqd $INDEX, $M0, $TMP0 + vpcmpeqd $INDEX, $M1, $TMP1 + + vpaddd $TWO, $M0, $M0 + vpaddd $TWO, $M1, $M1 + lea 32*6($in_t), $in_t + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + vpand $TMP0, $T0c, $T0c + vpand $TMP1, $T1a, $T1a + vpand $TMP1, $T1b, $T1b + vpand $TMP1, $T1c, $T1c + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + vpxor $T0c, $Rc, $Rc + vpxor $T1a, $Ra, $Ra + vpxor $T1b, $Rb, $Rb + vpxor $T1c, $Rc, $Rc + + dec %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu $Ra, 32*0($val) + vmovdqu $Rb, 32*1($val) + vmovdqu $Rc, 32*2($val) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea (%r11), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_avx2: +.size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 +___ +} +if ($avx>1) { +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); +my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); +my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); +my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); + +$code.=<<___; + +################################################################################ +# void ecp_nistz256_select_w7_avx2(uint64_t *val, uint64_t *in_t, int index); +.globl 
ecp_nistz256_select_w7_avx2 +.type ecp_nistz256_select_w7_avx2,\@abi-omnipotent +.align 32 +ecp_nistz256_select_w7_avx2: +.cfi_startproc + _CET_ENDBR + vzeroupper +___ +$code.=<<___ if ($win64); + mov %rsp,%r11 + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_select_w7_avx2: + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + vmovdqa .LThree(%rip), $THREE + + vpxor $Ra, $Ra, $Ra + vpxor $Rb, $Rb, $Rb + + vmovdqa .LOne(%rip), $M0 + vmovdqa .LTwo(%rip), $M1 + vmovdqa .LThree(%rip), $M2 + + vmovd $index, %xmm1 + vpermd $INDEX, $Ra, $INDEX + # Skip index = 0, because it is implicitly the point at infinity + + mov \$21, %rax +.Lselect_loop_avx2_w7: + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + + vmovdqa 32*2($in_t), $T1a + vmovdqa 32*3($in_t), $T1b + + vmovdqa 32*4($in_t), $T2a + vmovdqa 32*5($in_t), $T2b + + vpcmpeqd $INDEX, $M0, $TMP0 + vpcmpeqd $INDEX, $M1, $TMP1 + vpcmpeqd $INDEX, $M2, $TMP2 + + vpaddd $THREE, $M0, $M0 + vpaddd $THREE, $M1, $M1 + vpaddd $THREE, $M2, $M2 + lea 32*6($in_t), $in_t + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + vpand $TMP1, $T1a, $T1a + vpand $TMP1, $T1b, $T1b + vpand $TMP2, $T2a, $T2a + vpand $TMP2, $T2b, $T2b + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + vpxor $T1a, $Ra, $Ra + vpxor $T1b, $Rb, $Rb + vpxor $T2a, $Ra, $Ra + vpxor $T2b, $Rb, $Rb + + dec %rax + jnz .Lselect_loop_avx2_w7 + + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + + vpcmpeqd $INDEX, $M0, $TMP0 + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + + vmovdqu $Ra, 32*0($val) + vmovdqu $Rb, 32*1($val) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea (%r11), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_avx2: +.size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 +___ +} +{{{ +######################################################################## +# This block implements higher level point_double, point_add and +# point_add_affine. The key to performance in this case is to allow +# out-of-order execution logic to overlap computations from next step +# with tail processing from current step. By using tailored calling +# sequence we minimize inter-step overhead to give processor better +# shot at overlapping operations... +# +# You will notice that input data is copied to stack. Trouble is that +# there are no registers to spare for holding original pointers and +# reloading them, pointers, would create undesired dependencies on +# effective addresses calculation paths. In other words it's too done +# to favour out-of-order execution logic. 
+# + +my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); +my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); +my ($poly1,$poly3)=($acc6,$acc7); + +sub load_for_mul () { +my ($a,$b,$src0) = @_; +my $bias = $src0 eq "%rax" ? 0 : -128; + +" mov $b, $src0 + lea $b, $b_ptr + mov 8*0+$a, $acc1 + mov 8*1+$a, $acc2 + lea $bias+$a, $a_ptr + mov 8*2+$a, $acc3 + mov 8*3+$a, $acc4" +} + +sub load_for_sqr () { +my ($a,$src0) = @_; +my $bias = $src0 eq "%rax" ? 0 : -128; + +" mov 8*0+$a, $src0 + mov 8*1+$a, $acc6 + lea $bias+$a, $a_ptr + mov 8*2+$a, $acc7 + mov 8*3+$a, $acc0" +} + + { +######################################################################## +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); + +$code.=<<___; +.type __ecp_nistz256_add_toq,\@abi-omnipotent +.align 32 +__ecp_nistz256_add_toq: +.cfi_startproc + xor $t4,$t4 + add 8*0($b_ptr), $a0 + adc 8*1($b_ptr), $a1 + mov $a0, $t0 + adc 8*2($b_ptr), $a2 + adc 8*3($b_ptr), $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,\@abi-omnipotent +.align 32 +__ecp_nistz256_sub_fromq: +.cfi_startproc + sub 8*0($b_ptr), $a0 + sbb 8*1($b_ptr), $a1 + mov $a0, $t0 + sbb 8*2($b_ptr), $a2 + sbb 8*3($b_ptr), $a3 + mov $a1, $t1 + sbb $t4, $t4 + + add \$-1, $a0 + mov $a2, $t2 + adc $poly1, $a1 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + test $t4, $t4 + + cmovz $t0, $a0 + cmovz $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovz $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovz $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,\@abi-omnipotent +.align 32 +__ecp_nistz256_subq: +.cfi_startproc + sub $a0, $t0 + sbb $a1, $t1 + mov $t0, $a0 + sbb $a2, $t2 + sbb $a3, $t3 + mov $t1, $a1 + sbb $t4, $t4 + + add \$-1, $t0 + mov $t2, $a2 + adc $poly1, $t1 + adc \$0, $t2 + mov $t3, $a3 + adc $poly3, $t3 + test $t4, $t4 + + cmovnz $t0, $a0 + cmovnz $t1, $a1 + cmovnz $t2, $a2 + cmovnz $t3, $a3 + + ret +.cfi_endproc +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_by_2q: +.cfi_startproc + xor $t4, $t4 + add $a0, $a0 # a0:a3+a0:a3 + adc $a1, $a1 + mov $a0, $t0 + adc $a2, $a2 + adc $a3, $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +___ + } +sub gen_double () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = "_nohw"; + $bias = 0; + } else { + $src0 = "%rdx"; + $sfx = "_adx"; + $bias = 128; + } +$code.=<<___; +.globl ecp_nistz256_point_double$sfx +.type ecp_nistz256_point_double$sfx,\@function,2 +.align 32 
+ecp_nistz256_point_double$sfx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*5+8, %rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_double${x}_body: + +.Lpoint_double_shortcut$x: + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x + mov $a_ptr, $b_ptr # backup copy + movdqu 0x10($a_ptr), %xmm1 + mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order + mov 0x20+8*1($a_ptr), $acc5 + mov 0x20+8*2($a_ptr), $acc0 + mov 0x20+8*3($a_ptr), $acc1 + mov .Lpoly+8*1(%rip), $poly1 + mov .Lpoly+8*3(%rip), $poly3 + movdqa %xmm0, $in_x(%rsp) + movdqa %xmm1, $in_x+0x10(%rsp) + lea 0x20($r_ptr), $acc2 + lea 0x40($r_ptr), $acc3 + movq $r_ptr, %xmm0 + movq $acc2, %xmm1 + movq $acc3, %xmm2 + + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); + + mov 0x40+8*0($a_ptr), $src0 + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + lea 0x40-$bias($a_ptr), $a_ptr + lea $Zsqr(%rsp), $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); + + `&load_for_sqr("$S(%rsp)", "$src0")` + lea $S(%rsp), $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); + + mov 0x20($b_ptr), $src0 # $b_ptr is still valid + mov 0x40+8*0($b_ptr), $acc1 + mov 0x40+8*1($b_ptr), $acc2 + mov 0x40+8*2($b_ptr), $acc3 + mov 0x40+8*3($b_ptr), $acc4 + lea 0x40-$bias($b_ptr), $a_ptr + lea 0x20($b_ptr), $b_ptr + movq %xmm2, $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); + + mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order + mov $in_x+8*1(%rsp), $acc5 + lea $Zsqr(%rsp), $b_ptr + mov $in_x+8*2(%rsp), $acc0 + mov $in_x+8*3(%rsp), $acc1 + lea $M(%rsp), $r_ptr + call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); + + mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order + mov $in_x+8*1(%rsp), $acc5 + lea $Zsqr(%rsp), $b_ptr + mov $in_x+8*2(%rsp), $acc0 + mov $in_x+8*3(%rsp), $acc1 + lea $Zsqr(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); + + `&load_for_sqr("$S(%rsp)", "$src0")` + movq %xmm1, $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); +___ +{ +######## ecp_nistz256_div_by_2(res_y, res_y); ########################## +# operate in 4-5-6-7 "name space" that matches squaring output +# +my ($poly1,$poly3)=($a_ptr,$t1); +my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); + +$code.=<<___; + xor $t4, $t4 + mov $a0, $t0 + add \$-1, $a0 + mov $a1, $t1 + adc $poly1, $a1 + mov $a2, $t2 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + adc \$0, $t4 + xor $a_ptr, $a_ptr # borrow $a_ptr + test \$1, $t0 + + cmovz $t0, $a0 + cmovz $t1, $a1 + cmovz $t2, $a2 + cmovz $t3, $a3 + cmovz $a_ptr, $t4 + + mov $a1, $t0 # a0:a3>>1 + shr \$1, $a0 + shl \$63, $t0 + mov $a2, $t1 + shr \$1, $a1 + or $t0, $a0 + shl \$63, $t1 + mov $a3, $t2 + shr \$1, $a2 + or $t1, $a1 + shl \$63, $t2 + mov $a0, 8*0($r_ptr) + shr \$1, $a3 + mov $a1, 8*1($r_ptr) + shl \$63, $t4 + or $t2, $a2 + or $t4, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` + lea $M(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); + + lea $tmp0(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x + + lea $M(%rsp), $b_ptr + lea $M(%rsp), $r_ptr + call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); + + 
`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); + + lea $tmp0(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); + + `&load_for_sqr("$M(%rsp)", "$src0")` + movq %xmm0, $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); + + lea $tmp0(%rsp), $b_ptr + mov $acc6, $acc0 # harmonize sqr output and sub input + mov $acc7, $acc1 + mov $a_ptr, $poly1 + mov $t1, $poly3 + call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); + + mov $S+8*0(%rsp), $t0 + mov $S+8*1(%rsp), $t1 + mov $S+8*2(%rsp), $t2 + mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order + lea $S(%rsp), $r_ptr + call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); + + mov $M(%rsp), $src0 + lea $M(%rsp), $b_ptr + mov $acc4, $acc6 # harmonize sub output and mul input + xor %ecx, %ecx + mov $acc4, $S+8*0(%rsp) # have to save:-( + mov $acc5, $acc2 + mov $acc5, $S+8*1(%rsp) + cmovz $acc0, $acc3 + mov $acc0, $S+8*2(%rsp) + lea $S-$bias(%rsp), $a_ptr + cmovz $acc1, $acc4 + mov $acc1, $S+8*3(%rsp) + mov $acc6, $acc1 + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); + + movq %xmm1, $b_ptr + movq %xmm1, $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); + + lea 32*5+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_double${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx +___ +} +&gen_double("q"); + +sub gen_add () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2, + $res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); + my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = "_nohw"; + $bias = 0; + } else { + $src0 = "%rdx"; + $sfx = "_adx"; + $bias = 128; + } +$code.=<<___; +.globl ecp_nistz256_point_add$sfx +.type ecp_nistz256_point_add$sfx,\@function,3 +.align 32 +ecp_nistz256_point_add$sfx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*18+8, %rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_add${x}_body: + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + movdqu 0x30($a_ptr), %xmm3 + movdqu 0x40($a_ptr), %xmm4 + movdqu 0x50($a_ptr), %xmm5 + mov $a_ptr, $b_ptr # reassign + mov $b_org, $a_ptr # reassign + movdqa %xmm0, $in1_x(%rsp) + movdqa %xmm1, $in1_x+0x10(%rsp) + movdqa %xmm2, $in1_y(%rsp) + movdqa %xmm3, $in1_y+0x10(%rsp) + movdqa %xmm4, $in1_z(%rsp) + movdqa %xmm5, $in1_z+0x10(%rsp) + por %xmm4, %xmm5 + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr + pshufd \$0xb1, %xmm5, %xmm3 + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + por %xmm3, %xmm5 + movdqu 0x30($a_ptr), %xmm3 + mov 0x40+8*0($a_ptr), $src0 # load original in2_z + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + movdqa %xmm0, $in2_x(%rsp) + pshufd \$0x1e, %xmm5, %xmm4 + movdqa %xmm1, $in2_x+0x10(%rsp) + movdqu 0x40($a_ptr),%xmm0 # in2_z again + movdqu 0x50($a_ptr),%xmm1 + movdqa %xmm2, 
$in2_y(%rsp) + movdqa %xmm3, $in2_y+0x10(%rsp) + por %xmm4, %xmm5 + pxor %xmm4, %xmm4 + por %xmm0, %xmm1 + movq $r_ptr, %xmm0 # save $r_ptr + + lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid + mov $src0, $in2_z+8*0(%rsp) # make in2_z copy + mov $acc6, $in2_z+8*1(%rsp) + mov $acc7, $in2_z+8*2(%rsp) + mov $acc0, $in2_z+8*3(%rsp) + lea $Z2sqr(%rsp), $r_ptr # Z2^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); + + pcmpeqd %xmm4, %xmm5 + pshufd \$0xb1, %xmm1, %xmm4 + por %xmm1, %xmm4 + pshufd \$0, %xmm5, %xmm5 # in1infty + pshufd \$0x1e, %xmm4, %xmm3 + por %xmm3, %xmm4 + pxor %xmm3, %xmm3 + pcmpeqd %xmm3, %xmm4 + pshufd \$0, %xmm4, %xmm4 # in2infty + mov 0x40+8*0($b_ptr), $src0 # load original in1_z + mov 0x40+8*1($b_ptr), $acc6 + mov 0x40+8*2($b_ptr), $acc7 + mov 0x40+8*3($b_ptr), $acc0 + movq $b_ptr, %xmm1 + + lea 0x40-$bias($b_ptr), $a_ptr + lea $Z1sqr(%rsp), $r_ptr # Z1^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); + + `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` + lea $S1(%rsp), $r_ptr # S1 = Z2^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); + + `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); + + `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` + lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); + + `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); + + lea $S1(%rsp), $b_ptr + lea $R(%rsp), $r_ptr # R = S2 - S1 + call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); + + or $acc5, $acc4 # see if result is zero + movdqa %xmm4, %xmm2 + or $acc0, $acc4 + or $acc1, $acc4 + por %xmm5, %xmm2 # in1infty || in2infty + movq $acc4, %xmm3 + + `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` + lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); + + `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); + + lea $U1(%rsp), $b_ptr + lea $H(%rsp), $r_ptr # H = U2 - U1 + call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); + + or $acc5, $acc4 # see if result is zero + or $acc0, $acc4 + or $acc1, $acc4 # !is_equal(U1, U2) + + movq %xmm2, $acc0 + movq %xmm3, $acc1 + or $acc0, $acc4 + .byte 0x3e # predict taken + jnz .Ladd_proceed$x # !is_equal(U1, U2) || in1infty || in2infty + + # We now know A = B or A = -B and neither is infinity. Compare the + # y-coordinates via S1 and S2. + test $acc1, $acc1 + jz .Ladd_double$x # is_equal(S1, S2) + + # A = -B, so the result is infinity. + # + # TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in + # which case we should eliminate this special-case and simplify the + # timing analysis. 
+ movq %xmm0, $r_ptr # restore $r_ptr + pxor %xmm0, %xmm0 + movdqu %xmm0, 0x00($r_ptr) + movdqu %xmm0, 0x10($r_ptr) + movdqu %xmm0, 0x20($r_ptr) + movdqu %xmm0, 0x30($r_ptr) + movdqu %xmm0, 0x40($r_ptr) + movdqu %xmm0, 0x50($r_ptr) + jmp .Ladd_done$x + +.align 32 +.Ladd_double$x: + movq %xmm1, $a_ptr # restore $a_ptr + movq %xmm0, $r_ptr # restore $r_ptr + add \$`32*(18-5)`, %rsp # difference in frame sizes +.cfi_adjust_cfa_offset `-32*(18-5)` + jmp .Lpoint_double_shortcut$x +.cfi_adjust_cfa_offset `32*(18-5)` + +.align 32 +.Ladd_proceed$x: + `&load_for_sqr("$R(%rsp)", "$src0")` + lea $Rsqr(%rsp), $r_ptr # R^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); + + `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); + + `&load_for_sqr("$H(%rsp)", "$src0")` + lea $Hsqr(%rsp), $r_ptr # H^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); + + `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); + + `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` + lea $Hcub(%rsp), $r_ptr # H^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); + + `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U1*H^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); +___ +{ +####################################################################### +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); +my ($poly1, $poly3)=($acc6,$acc7); + +$code.=<<___; + #lea $U2(%rsp), $a_ptr + #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 + #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); + + xor $t4, $t4 + add $acc0, $acc0 # a0:a3+a0:a3 + lea $Rsqr(%rsp), $a_ptr + adc $acc1, $acc1 + mov $acc0, $t0 + adc $acc2, $acc2 + adc $acc3, $acc3 + mov $acc1, $t1 + adc \$0, $t4 + + sub \$-1, $acc0 + mov $acc2, $t2 + sbb $poly1, $acc1 + sbb \$0, $acc2 + mov $acc3, $t3 + sbb $poly3, $acc3 + sbb \$0, $t4 + + cmovc $t0, $acc0 + mov 8*0($a_ptr), $t0 + cmovc $t1, $acc1 + mov 8*1($a_ptr), $t1 + cmovc $t2, $acc2 + mov 8*2($a_ptr), $t2 + cmovc $t3, $acc3 + mov 8*3($a_ptr), $t3 + + call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); + + lea $Hcub(%rsp), $b_ptr + lea $res_x(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); + + mov $U2+8*0(%rsp), $t0 + mov $U2+8*1(%rsp), $t1 + mov $U2+8*2(%rsp), $t2 + mov $U2+8*3(%rsp), $t3 + lea $res_y(%rsp), $r_ptr + + call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); + + mov $acc0, 8*0($r_ptr) # save the result, as + mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); + + `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); + + lea $S2(%rsp), $b_ptr + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); + + movq %xmm0, $r_ptr # restore $r_ptr + + movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_z(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_z+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand 
$in2_z(%rsp), %xmm2 + pand $in2_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_z(%rsp), %xmm2 + pand $in1_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x40($r_ptr) + movdqu %xmm3, 0x50($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_x(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_x+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_x(%rsp), %xmm2 + pand $in2_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_x(%rsp), %xmm2 + pand $in1_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x00($r_ptr) + movdqu %xmm3, 0x10($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_y(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_y+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_y(%rsp), %xmm2 + pand $in2_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_y(%rsp), %xmm2 + pand $in1_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x20($r_ptr) + movdqu %xmm3, 0x30($r_ptr) + +.Ladd_done$x: + lea 32*18+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_add${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx +___ +} +&gen_add("q"); + +sub gen_add_affine () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, + $res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y)=map(32*$_,(0..14)); + my $Z1sqr = $S2; + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = "_nohw"; + $bias = 0; + } else { + $src0 = "%rdx"; + $sfx = "_adx"; + $bias = 128; + } +$code.=<<___; +.globl ecp_nistz256_point_add_affine$sfx +.type ecp_nistz256_point_add_affine$sfx,\@function,3 +.align 32 +ecp_nistz256_point_add_affine$sfx: +.cfi_startproc + _CET_ENDBR + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*15+8, %rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affine${x}_body: + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr + mov $b_org, $b_ptr # reassign + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + movdqu 0x30($a_ptr), %xmm3 + movdqu 0x40($a_ptr), %xmm4 + movdqu 0x50($a_ptr), %xmm5 + mov 0x40+8*0($a_ptr), $src0 # load original in1_z + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + movdqa %xmm0, $in1_x(%rsp) + movdqa %xmm1, $in1_x+0x10(%rsp) + movdqa %xmm2, $in1_y(%rsp) + movdqa %xmm3, $in1_y+0x10(%rsp) + movdqa %xmm4, $in1_z(%rsp) + movdqa %xmm5, $in1_z+0x10(%rsp) + por %xmm4, %xmm5 + + movdqu 0x00($b_ptr), %xmm0 # 
copy *(P256_POINT_AFFINE *)$b_ptr + pshufd \$0xb1, %xmm5, %xmm3 + movdqu 0x10($b_ptr), %xmm1 + movdqu 0x20($b_ptr), %xmm2 + por %xmm3, %xmm5 + movdqu 0x30($b_ptr), %xmm3 + movdqa %xmm0, $in2_x(%rsp) + pshufd \$0x1e, %xmm5, %xmm4 + movdqa %xmm1, $in2_x+0x10(%rsp) + por %xmm0, %xmm1 + movq $r_ptr, %xmm0 # save $r_ptr + movdqa %xmm2, $in2_y(%rsp) + movdqa %xmm3, $in2_y+0x10(%rsp) + por %xmm2, %xmm3 + por %xmm4, %xmm5 + pxor %xmm4, %xmm4 + por %xmm1, %xmm3 + + lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid + lea $Z1sqr(%rsp), $r_ptr # Z1^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); + + pcmpeqd %xmm4, %xmm5 + pshufd \$0xb1, %xmm3, %xmm4 + mov 0x00($b_ptr), $src0 # $b_ptr is still valid + #lea 0x00($b_ptr), $b_ptr + mov $acc4, $acc1 # harmonize sqr output and mul input + por %xmm3, %xmm4 + pshufd \$0, %xmm5, %xmm5 # in1infty + pshufd \$0x1e, %xmm4, %xmm3 + mov $acc5, $acc2 + por %xmm3, %xmm4 + pxor %xmm3, %xmm3 + mov $acc6, $acc3 + pcmpeqd %xmm3, %xmm4 + pshufd \$0, %xmm4, %xmm4 # in2infty + + lea $Z1sqr-$bias(%rsp), $a_ptr + mov $acc7, $acc4 + lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); + + lea $in1_x(%rsp), $b_ptr + lea $H(%rsp), $r_ptr # H = U2 - U1 + call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); + + `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); + + `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); + + `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); + + lea $in1_y(%rsp), $b_ptr + lea $R(%rsp), $r_ptr # R = S2 - S1 + call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); + + `&load_for_sqr("$H(%rsp)", "$src0")` + lea $Hsqr(%rsp), $r_ptr # H^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); + + `&load_for_sqr("$R(%rsp)", "$src0")` + lea $Rsqr(%rsp), $r_ptr # R^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); + + `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` + lea $Hcub(%rsp), $r_ptr # H^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); + + `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U1*H^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); +___ +{ +####################################################################### +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); +my ($poly1, $poly3)=($acc6,$acc7); + +$code.=<<___; + #lea $U2(%rsp), $a_ptr + #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 + #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); + + xor $t4, $t4 + add $acc0, $acc0 # a0:a3+a0:a3 + lea $Rsqr(%rsp), $a_ptr + adc $acc1, $acc1 + mov $acc0, $t0 + adc $acc2, $acc2 + adc $acc3, $acc3 + mov $acc1, $t1 + adc \$0, $t4 + + sub \$-1, $acc0 + mov $acc2, $t2 + sbb $poly1, $acc1 + sbb \$0, $acc2 + mov $acc3, $t3 + sbb $poly3, $acc3 + sbb \$0, $t4 + + cmovc $t0, $acc0 + mov 8*0($a_ptr), $t0 + cmovc $t1, $acc1 + mov 8*1($a_ptr), $t1 + cmovc $t2, $acc2 + mov 8*2($a_ptr), $t2 + cmovc $t3, $acc3 + mov 8*3($a_ptr), $t3 + + call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); + + lea $Hcub(%rsp), $b_ptr + lea $res_x(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # 
p256_sub(res_x, res_x, Hcub); + + mov $U2+8*0(%rsp), $t0 + mov $U2+8*1(%rsp), $t1 + mov $U2+8*2(%rsp), $t2 + mov $U2+8*3(%rsp), $t3 + lea $H(%rsp), $r_ptr + + call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); + + mov $acc0, 8*0($r_ptr) # save the result, as + mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); + + `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` + lea $H(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); + + lea $S2(%rsp), $b_ptr + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); + + movq %xmm0, $r_ptr # restore $r_ptr + + movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_z(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_z+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand .LONE_mont(%rip), %xmm2 + pand .LONE_mont+0x10(%rip), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_z(%rsp), %xmm2 + pand $in1_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x40($r_ptr) + movdqu %xmm3, 0x50($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_x(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_x+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_x(%rsp), %xmm2 + pand $in2_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_x(%rsp), %xmm2 + pand $in1_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x00($r_ptr) + movdqu %xmm3, 0x10($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_y(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_y+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_y(%rsp), %xmm2 + pand $in2_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_y(%rsp), %xmm2 + pand $in1_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x20($r_ptr) + movdqu %xmm3, 0x30($r_ptr) + + lea 32*15+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affine${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx +___ +} +&gen_add_affine("q"); + +######################################################################## +# AD*X magic +# +if ($addx) { { +######################################################################## +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); + +$code.=<<___; +.type 
__ecp_nistz256_add_tox,\@abi-omnipotent +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xor $t4, $t4 + adc 8*0($b_ptr), $a0 + adc 8*1($b_ptr), $a1 + mov $a0, $t0 + adc 8*2($b_ptr), $a2 + adc 8*3($b_ptr), $a3 + mov $a1, $t1 + adc \$0, $t4 + + xor $t3, $t3 + sbb \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,\@abi-omnipotent +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xor $t4, $t4 + sbb 8*0($b_ptr), $a0 + sbb 8*1($b_ptr), $a1 + mov $a0, $t0 + sbb 8*2($b_ptr), $a2 + sbb 8*3($b_ptr), $a3 + mov $a1, $t1 + sbb \$0, $t4 + + xor $t3, $t3 + adc \$-1, $a0 + mov $a2, $t2 + adc $poly1, $a1 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + + bt \$0, $t4 + cmovnc $t0, $a0 + cmovnc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovnc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovnc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,\@abi-omnipotent +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xor $t4, $t4 + sbb $a0, $t0 + sbb $a1, $t1 + mov $t0, $a0 + sbb $a2, $t2 + sbb $a3, $t3 + mov $t1, $a1 + sbb \$0, $t4 + + xor $a3 ,$a3 + adc \$-1, $t0 + mov $t2, $a2 + adc $poly1, $t1 + adc \$0, $t2 + mov $t3, $a3 + adc $poly3, $t3 + + bt \$0, $t4 + cmovc $t0, $a0 + cmovc $t1, $a1 + cmovc $t2, $a2 + cmovc $t3, $a3 + + ret +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xor $t4, $t4 + adc $a0, $a0 # a0:a3+a0:a3 + adc $a1, $a1 + mov $a0, $t0 + adc $a2, $a2 + adc $a3, $a3 + mov $a1, $t1 + adc \$0, $t4 + + xor $t3, $t3 + sbb \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +___ + } +&gen_double("x"); +&gen_add("x"); +&gen_add_affine("x"); +} +}}} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type short_handler,\@abi-omnipotent +.align 16 +short_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 16(%rax),%rax + + mov -8(%rax),%r12 + mov -16(%rax),%r13 + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + + jmp .Lcommon_seh_tail +.size short_handler,.-short_handler + +.type full_handler,\@abi-omnipotent 
+.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rax,%r10),%rax + + mov -8(%rax),%rbp + mov -16(%rax),%rbx + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ecp_nistz256_neg + .rva .LSEH_end_ecp_nistz256_neg + .rva .LSEH_info_ecp_nistz256_neg + + .rva .LSEH_begin_ecp_nistz256_ord_mul_mont_nohw + .rva .LSEH_end_ecp_nistz256_ord_mul_mont_nohw + .rva .LSEH_info_ecp_nistz256_ord_mul_mont_nohw + + .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont_nohw + .rva .LSEH_end_ecp_nistz256_ord_sqr_mont_nohw + .rva .LSEH_info_ecp_nistz256_ord_sqr_mont_nohw +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_ord_mul_mont_adx + .rva .LSEH_end_ecp_nistz256_ord_mul_mont_adx + .rva .LSEH_info_ecp_nistz256_ord_mul_mont_adx + + .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont_adx + .rva .LSEH_end_ecp_nistz256_ord_sqr_mont_adx + .rva .LSEH_info_ecp_nistz256_ord_sqr_mont_adx +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_mul_mont_nohw + .rva .LSEH_end_ecp_nistz256_mul_mont_nohw + .rva .LSEH_info_ecp_nistz256_mul_mont_nohw + + .rva .LSEH_begin_ecp_nistz256_sqr_mont_nohw + .rva .LSEH_end_ecp_nistz256_sqr_mont_nohw + .rva .LSEH_info_ecp_nistz256_sqr_mont_nohw +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_mul_mont_adx + .rva .LSEH_end_ecp_nistz256_mul_mont_adx + .rva .LSEH_info_ecp_nistz256_mul_mont_adx + + .rva .LSEH_begin_ecp_nistz256_sqr_mont_adx + .rva .LSEH_end_ecp_nistz256_sqr_mont_adx + .rva .LSEH_info_ecp_nistz256_sqr_mont_adx +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_select_w5_nohw + .rva .LSEH_end_ecp_nistz256_select_w5_nohw + .rva 
.LSEH_info_ecp_nistz256_select_wX_nohw + + .rva .LSEH_begin_ecp_nistz256_select_w7_nohw + .rva .LSEH_end_ecp_nistz256_select_w7_nohw + .rva .LSEH_info_ecp_nistz256_select_wX_nohw +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ecp_nistz256_select_w5_avx2 + .rva .LSEH_end_ecp_nistz256_select_w5_avx2 + .rva .LSEH_info_ecp_nistz256_select_wX_avx2 + + .rva .LSEH_begin_ecp_nistz256_select_w7_avx2 + .rva .LSEH_end_ecp_nistz256_select_w7_avx2 + .rva .LSEH_info_ecp_nistz256_select_wX_avx2 +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_point_double_nohw + .rva .LSEH_end_ecp_nistz256_point_double_nohw + .rva .LSEH_info_ecp_nistz256_point_double_nohw + + .rva .LSEH_begin_ecp_nistz256_point_add_nohw + .rva .LSEH_end_ecp_nistz256_point_add_nohw + .rva .LSEH_info_ecp_nistz256_point_add_nohw + + .rva .LSEH_begin_ecp_nistz256_point_add_affine_nohw + .rva .LSEH_end_ecp_nistz256_point_add_affine_nohw + .rva .LSEH_info_ecp_nistz256_point_add_affine_nohw +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_point_double_adx + .rva .LSEH_end_ecp_nistz256_point_double_adx + .rva .LSEH_info_ecp_nistz256_point_double_adx + + .rva .LSEH_begin_ecp_nistz256_point_add_adx + .rva .LSEH_end_ecp_nistz256_point_add_adx + .rva .LSEH_info_ecp_nistz256_point_add_adx + + .rva .LSEH_begin_ecp_nistz256_point_add_affine_adx + .rva .LSEH_end_ecp_nistz256_point_add_affine_adx + .rva .LSEH_info_ecp_nistz256_point_add_affine_adx +___ +$code.=<<___; + +.section .xdata +.align 8 +.LSEH_info_ecp_nistz256_neg: + .byte 9,0,0,0 + .rva short_handler + .rva .Lneg_body,.Lneg_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_ord_mul_mont_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_ord_sqr_mont_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___ if ($addx); +.LSEH_info_ecp_nistz256_ord_mul_mont_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_ord_sqr_mont_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_mul_mont_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_sqr_mont_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___ if ($addx); +.LSEH_info_ecp_nistz256_mul_mont_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lmulx_body,.Lmulx_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_sqr_mont_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lsqrx_body,.Lsqrx_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_select_wX_nohw: + .byte 0x01,0x33,0x16,0x00 + .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 + .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 + .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 + .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 + .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 + .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 + .align 8 +___ +$code.=<<___ if ($avx>1); 
+.LSEH_info_ecp_nistz256_select_wX_avx2: + .byte 0x01,0x36,0x17,0x0b + .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 + .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 + .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 + .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 + .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 + .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 + .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 + .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 + .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 + .byte 0x00,0xb3,0x00,0x00 # set_frame r11 + .align 8 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_point_double_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] + .long 32*5+56,0 +.LSEH_info_ecp_nistz256_point_add_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] + .long 32*18+56,0 +.LSEH_info_ecp_nistz256_point_add_affine_nohw: + .byte 9,0,0,0 + .rva full_handler + .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] + .long 32*15+56,0 +___ +$code.=<<___ if ($addx); +.align 8 +.LSEH_info_ecp_nistz256_point_double_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] + .long 32*5+56,0 +.LSEH_info_ecp_nistz256_point_add_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] + .long 32*18+56,0 +.LSEH_info_ecp_nistz256_point_add_affine_adx: + .byte 9,0,0,0 + .rva full_handler + .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] + .long 32*15+56,0 +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.c b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.c new file mode 100644 index 0000000000..1811e28374 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.c @@ -0,0 +1,52 @@ +/* Copyright (c) 2014, Intel Corporation. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "ecp_nistz.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +/* Fills |str| with the bytewise little-endian encoding of |scalar|, where + * |scalar| has |num_limbs| limbs. |str| is padded with zeros at the end up + * to |str_len| bytes. Actually, |str_len| must be exactly one byte more than + * needed to encode |num_limbs| losslessly, so that there is an extra byte at + * the end. The extra byte is useful because the caller will be breaking |str| + * up into windows of a number of bits (5 or 7) that isn't divisible by 8, and + * so it is useful for it to be able to read an extra zero byte. 
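+ *
+ * As an added, hypothetical illustration (not part of the upstream comment):
+ * with 64-bit limbs, |num_limbs| == 1 and scalar[0] == 0x0123456789abcdef,
+ * the caller passes |str_len| == 9 and |str| receives the bytes
+ * ef cd ab 89 67 45 23 01 00, where the trailing 00 is the extra zero byte
+ * described above.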
*/ +void little_endian_bytes_from_scalar(uint8_t str[], size_t str_len, + const Limb scalar[], + size_t num_limbs) { + debug_assert_nonsecret(str_len == (num_limbs * sizeof(Limb)) + 1); + + size_t i; + for (i = 0; i < num_limbs * sizeof(Limb); i += sizeof(Limb)) { + Limb d = scalar[i / sizeof(Limb)]; + + str[i + 0] = d & 0xff; + str[i + 1] = (d >> 8) & 0xff; + str[i + 2] = (d >> 16) & 0xff; + str[i + 3] = (d >>= 24) & 0xff; + if (sizeof(Limb) == 8) { + d >>= 8; + str[i + 4] = d & 0xff; + str[i + 5] = (d >> 8) & 0xff; + str[i + 6] = (d >> 16) & 0xff; + str[i + 7] = (d >> 24) & 0xff; + } + } + for (; i < str_len; i++) { + str[i] = 0; + } +} diff --git a/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.h b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.h new file mode 100644 index 0000000000..3c04c475c6 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz.h @@ -0,0 +1,274 @@ +/* Copyright (c) 2015, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_EC_ECP_NISTZ_H +#define OPENSSL_HEADER_EC_ECP_NISTZ_H + +#include + +#include "../../limbs/limbs.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +// This function looks at `w + 1` scalar bits (`w` current, 1 adjacent less +// significant bit), and recodes them into a signed digit for use in fast point +// multiplication: the use of signed rather than unsigned digits means that +// fewer points need to be precomputed, given that point inversion is easy (a +// precomputed point dP makes -dP available as well). +// +// BACKGROUND: +// +// Signed digits for multiplication were introduced by Booth ("A signed binary +// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, +// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. +// Booth's original encoding did not generally improve the density of nonzero +// digits over the binary representation, and was merely meant to simplify the +// handling of signed factors given in two's complement; but it has since been +// shown to be the basis of various signed-digit representations that do have +// further advantages, including the wNAF, using the following general +// approach: +// +// (1) Given a binary representation +// +// b_k ... b_2 b_1 b_0, +// +// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 +// by using bit-wise subtraction as follows: +// +// b_k b_(k-1) ... b_2 b_1 b_0 +// - b_k ... b_3 b_2 b_1 b_0 +// ----------------------------------------- +// s_(k+1) s_k ... s_3 s_2 s_1 s_0 +// +// A left-shift followed by subtraction of the original value yields a new +// representation of the same value, using signed bits s_i = b_(i-1) - b_i. 
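+//
+// (Added worked example, not part of the upstream comment: for the value
+// 7 = 0111 in binary, this rewriting gives s_3 = 1, s_2 = s_1 = 0 and
+// s_0 = -1, so that 7 = 2^3 - 2^0; note that the nonzero signed bits
+// alternate in sign, as observed below.)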
+// This representation from Booth's paper has since appeared in the +// literature under a variety of different names including "reversed binary +// form", "alternating greedy expansion", "mutual opposite form", and +// "sign-alternating {+-1}-representation". +// +// An interesting property is that among the nonzero bits, values 1 and -1 +// strictly alternate. +// +// (2) Various window schemes can be applied to the Booth representation of +// integers: for example, right-to-left sliding windows yield the wNAF +// (a signed-digit encoding independently discovered by various researchers +// in the 1990s), and left-to-right sliding windows yield a left-to-right +// equivalent of the wNAF (independently discovered by various researchers +// around 2004). +// +// To prevent leaking information through side channels in point multiplication, +// we need to recode the given integer into a regular pattern: sliding windows +// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few +// decades older: we'll be using the so-called "modified Booth encoding" due to +// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 +// (1961), pp. 67-91), in a radix-2**w setting. That is, we always combine `w` +// signed bits into a signed digit, e.g. (for `w == 5`): +// +// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) +// +// The sign-alternating property implies that the resulting digit values are +// integers from `-2**(w-1)` to `2**(w-1)`, e.g. -16 to 16 for `w == 5`. +// +// Of course, we don't actually need to compute the signed digits s_i as an +// intermediate step (that's just a nice way to see how this scheme relates +// to the wNAF): a direct computation obtains the recoded digit from the +// six bits b_(5j + 4) ... b_(5j - 1). +// +// This function takes those `w` bits as an integer (e.g. 0 .. 63), writing the +// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute +// value, in the range 0 .. 2**(w-1). Note that this integer essentially provides +// the input bits "shifted to the left" by one position: for example, the input +// to compute the least significant recoded digit, given that there's no bit +// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. +// +// DOUBLING CASE: +// +// Point addition formulas for short Weierstrass curves are often incomplete. +// Edge cases such as P + P or P + ∞ must be handled separately. This +// complicates constant-time requirements. P + ∞ cannot be avoided (any window +// may be zero) and is handled with constant-time selects. P + P (where P is not +// ∞) usually is not. Instead, windowing strategies are chosen to avoid this +// case. Whether this happens depends on the group order. +// +// Let w be the window width (in this function, w = 5). The non-trivial doubling +// case in single-point scalar multiplication may occur if and only if the +// 2^(w-1) bit of the group order is zero. +// +// Note the above only holds if the scalar is fully reduced and the group order +// is a prime that is much larger than 2^w. It also only holds when windows +// are applied from most significant to least significant, doubling between each +// window. It does not apply to more complex table strategies such as +// |EC_nistz256_method|. +// +// PROOF: +// +// Let n be the group order. Let l be the number of bits needed to represent n. +// Assume there exists some 0 <= k < n such that signed w-bit windowed +// multiplication hits the doubling case. 
+// +// Windowed multiplication consists of iterating over groups of s_i (defined +// above based on k's binary representation) from most to least significant. At +// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant +// window), we: +// +// 1. Double the accumulator A, w times. Let A_i be the value of A at this +// point. +// +// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P +// corresponding to the window s_(i+w-1) ... s_i. +// +// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as +// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i. +// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is +// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i = +// 2^w * (a_(i+w) + t_(i+w)). +// +// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it +// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i. +// This is computed as: +// +// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1) +// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i +// -------------------------------------------- +// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i +// +// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer +// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i. +// +// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x) +// = x - 2^(w-1)*b_(i+w-1) + b_(i-1) +// +// Or, using C notation for bit operations: +// +// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1 +// +// Note b_(i-1) is added in left-shifted by one (or doubled) from its place. +// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed +// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C +// notation, this is: +// +// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w +// +// Observe that, while t_i may be positive or negative, a_i is bounded by +// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up +// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for +// all groups. That would imply the subsequent a_i is zero, which means all +// terms thus far were zero.) +// +// Returning to our doubling position, we have a_j = t_j (mod n). We now +// determine the value of a_j - t_j, which must be divisible by n. Our bounds on +// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w +// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if +// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n. +// +// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then, +// +// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j +// <= k/2^j + 2^w - t_j +// < n/2^w + 2^w + 2^(w-1) +// +// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final +// addition may hit the doubling case. +// +// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L +// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the +// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0. +// That is: +// +// - 2^w divides k_H +// - k_M is 0 or 2^(w-1) +// - 0 <= k_L < 2^(w-1) +// +// Divide n into n_H + n_M + n_L similarly. 
We thus have: +// +// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1 +// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1)) +// = k_L - k_M +// +// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w +// = (k>>w) << w + ((k>>(w-1)) & 1) << w +// = k_H + 2*k_M +// +// n = a_0 - t_0 +// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M) +// = k_H + 3*k_M - k_L +// +// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be +// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H. +// Then, +// +// n_M + n_L = 3*(2^(w-1)) - k_L +// > 3*(2^(w-1)) - 2^(w-1) +// = 2^w +// +// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose +// k_H < n_H - 2*2^w. Then, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// < n_H - 2*2^w + 3*(2^(w-1)) - k_L +// n_M + n_L < -2^(w-1) - k_L +// +// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// = n_H - 2^w + 3*(2^(w-1)) - k_L +// n_M + n_L = 2^(w-1) - k_L +// <= 2^(w-1) +// +// Equality would mean 2^(w-1) divides n, which is impossible if n is prime. +// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition. +// +// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w, +// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point +// doubling in the final addition and is the only such scalar. +// +// COMMON CURVES: +// +// The group orders for common curves end in the following bit patterns: +// +// P-521: ...00001001; w = 4 is okay +// P-384: ...01110011; w = 2, 5, 6, 7 are okay +// P-256: ...01010001; w = 5, 7 are okay +// P-224: ...00111101; w = 3, 4, 5, 6 are okay +static inline void booth_recode(crypto_word_t *is_negative, crypto_word_t *digit, + crypto_word_t in, crypto_word_t w) { + debug_assert_nonsecret(w >= 2); + debug_assert_nonsecret(w <= 7); + + // Set all bits of `s` to MSB(in), similar to |constant_time_msb_s|, + // but 'in' seen as (`w+1`)-bit value. + crypto_word_t s = ~((in >> w) - 1); + crypto_word_t d; + d = ((crypto_word_t)1u << (w + 1)) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + *is_negative = constant_time_is_nonzero_w(s & 1); + *digit = d; +} + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +void little_endian_bytes_from_scalar(uint8_t str[], size_t str_len, + const Limb scalar[], + size_t num_limbs); + +#endif // OPENSSL_HEADER_EC_ECP_NISTZ_H diff --git a/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.h b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.h new file mode 100644 index 0000000000..ca87e60721 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2014, Intel Corporation. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H +#define OPENSSL_HEADER_EC_ECP_NISTZ384_H + +#include "../../limbs/limbs.h" + +#define P384_LIMBS (384u / LIMB_BITS) + +typedef struct { + Limb X[P384_LIMBS]; + Limb Y[P384_LIMBS]; + Limb Z[P384_LIMBS]; +} P384_POINT; + +typedef struct { + Limb X[P384_LIMBS]; + Limb Y[P384_LIMBS]; +} P384_POINT_AFFINE; + + +#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H diff --git a/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.inl b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.inl new file mode 100644 index 0000000000..ae28f97ae5 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/ecp_nistz384.inl @@ -0,0 +1,300 @@ +/* Copyright (c) 2014, Intel Corporation. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* Developers and authors: + * Shay Gueron (1, 2), and Vlad Krasnov (1) + * (1) Intel Corporation, Israel Development Center + * (2) University of Haifa + * Reference: + * Shay Gueron and Vlad Krasnov + * "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes" + * http://eprint.iacr.org/2013/816 */ + +#include "ecp_nistz.h" + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +/* Point double: r = 2*a */ +static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) { + BN_ULONG S[P384_LIMBS]; + BN_ULONG M[P384_LIMBS]; + BN_ULONG Zsqr[P384_LIMBS]; + BN_ULONG tmp0[P384_LIMBS]; + + const BN_ULONG *in_x = a->X; + const BN_ULONG *in_y = a->Y; + const BN_ULONG *in_z = a->Z; + + BN_ULONG *res_x = r->X; + BN_ULONG *res_y = r->Y; + BN_ULONG *res_z = r->Z; + + elem_mul_by_2(S, in_y); + + elem_sqr_mont(Zsqr, in_z); + + elem_sqr_mont(S, S); + + elem_mul_mont(res_z, in_z, in_y); + elem_mul_by_2(res_z, res_z); + + elem_add(M, in_x, Zsqr); + elem_sub(Zsqr, in_x, Zsqr); + + elem_sqr_mont(res_y, S); + elem_div_by_2(res_y, res_y); + + elem_mul_mont(M, M, Zsqr); + elem_mul_by_3(M, M); + + elem_mul_mont(S, S, in_x); + elem_mul_by_2(tmp0, S); + + elem_sqr_mont(res_x, M); + + elem_sub(res_x, res_x, tmp0); + elem_sub(S, S, res_x); + + elem_mul_mont(S, S, M); + elem_sub(res_y, S, res_y); +} + +/* Point addition: r = a+b */ +static void nistz384_point_add(P384_POINT *r, const P384_POINT *a, + const P384_POINT *b) { + BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS]; + BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS]; + BN_ULONG Z1sqr[P384_LIMBS]; + BN_ULONG Z2sqr[P384_LIMBS]; + BN_ULONG H[P384_LIMBS], R[P384_LIMBS]; + BN_ULONG Hsqr[P384_LIMBS]; + BN_ULONG Rsqr[P384_LIMBS]; + BN_ULONG Hcub[P384_LIMBS]; + + BN_ULONG res_x[P384_LIMBS]; + BN_ULONG res_y[P384_LIMBS]; + BN_ULONG res_z[P384_LIMBS]; + + const BN_ULONG *in1_x = a->X; + const BN_ULONG *in1_y = a->Y; + const BN_ULONG *in1_z = a->Z; + + const BN_ULONG *in2_x = b->X; + const BN_ULONG *in2_y = b->Y; + const BN_ULONG *in2_z = b->Z; + + BN_ULONG in1infty = 
is_zero(a->Z); + BN_ULONG in2infty = is_zero(b->Z); + + elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ + elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ + + elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ + elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ + + elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ + elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ + elem_sub(R, S2, S1); /* R = S2 - S1 */ + + elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ + elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ + elem_sub(H, U2, U1); /* H = U2 - U1 */ + + BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty; + if (is_exceptional) { + if (is_equal(S1, S2)) { + nistz384_point_double(r, a); + } else { + limbs_zero(r->X, P384_LIMBS); + limbs_zero(r->Y, P384_LIMBS); + limbs_zero(r->Z, P384_LIMBS); + } + return; + } + + elem_sqr_mont(Rsqr, R); /* R^2 */ + elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ + elem_sqr_mont(Hsqr, H); /* H^2 */ + elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ + elem_mul_mont(Hcub, Hsqr, H); /* H^3 */ + + elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ + elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ + + elem_sub(res_x, Rsqr, Hsqr); + elem_sub(res_x, res_x, Hcub); + + elem_sub(res_y, U2, res_x); + + elem_mul_mont(S2, S1, Hcub); + elem_mul_mont(res_y, R, res_y); + elem_sub(res_y, res_y, S2); + + copy_conditional(res_x, in2_x, in1infty); + copy_conditional(res_y, in2_y, in1infty); + copy_conditional(res_z, in2_z, in1infty); + + copy_conditional(res_x, in1_x, in2infty); + copy_conditional(res_y, in1_y, in2infty); + copy_conditional(res_z, in1_z, in2infty); + + limbs_copy(r->X, res_x, P384_LIMBS); + limbs_copy(r->Y, res_y, P384_LIMBS); + limbs_copy(r->Z, res_z, P384_LIMBS); +} + +static void add_precomputed_w5(P384_POINT *r, crypto_word_t wvalue, + const P384_POINT table[16]) { + crypto_word_t recoded_is_negative; + crypto_word_t recoded; + booth_recode(&recoded_is_negative, &recoded, wvalue, 5); + + alignas(64) P384_POINT h; + p384_point_select_w5(&h, table, recoded); + + alignas(64) BN_ULONG tmp[P384_LIMBS]; + p384_elem_neg(tmp, h.Y); + copy_conditional(h.Y, tmp, recoded_is_negative); + + nistz384_point_add(r, r, &h); +} + +/* r = p * p_scalar */ +static void nistz384_point_mul(P384_POINT *r, + const BN_ULONG p_scalar[P384_LIMBS], + const Limb p_x[P384_LIMBS], + const Limb p_y[P384_LIMBS]) { + static const size_t kWindowSize = 5; + static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + + uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1]; + little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), + p_scalar, P384_LIMBS); + + /* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should + * add no more than 63 bytes of overhead. Thus, |table| should require + * ~2367 ((144 * 16) + 63) bytes of stack space. */ + alignas(64) P384_POINT table[16]; + + /* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is + * not stored. All other values are actually stored with an offset of -1 in + * table. 
*/ + P384_POINT *row = table; + + limbs_copy(row[1 - 1].X, p_x, P384_LIMBS); + limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS); + limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS); + + nistz384_point_double(&row[2 - 1], &row[1 - 1]); + nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); + nistz384_point_double(&row[4 - 1], &row[2 - 1]); + nistz384_point_double(&row[6 - 1], &row[3 - 1]); + nistz384_point_double(&row[8 - 1], &row[4 - 1]); + nistz384_point_double(&row[12 - 1], &row[6 - 1]); + nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); + nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); + nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); + nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); + nistz384_point_double(&row[14 - 1], &row[7 - 1]); + nistz384_point_double(&row[10 - 1], &row[5 - 1]); + nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); + nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); + nistz384_point_double(&row[16 - 1], &row[8 - 1]); + + static const size_t START_INDEX = 384 - 4; + size_t index = START_INDEX; + + BN_ULONG recoded_is_negative; + crypto_word_t recoded; + + crypto_word_t wvalue = p_str[(index - 1) / 8]; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + + booth_recode(&recoded_is_negative, &recoded, wvalue, 5); + dev_assert_secret(!recoded_is_negative); + + p384_point_select_w5(r, table, recoded); + + while (index >= kWindowSize) { + if (index != START_INDEX) { + size_t off = (index - 1) / 8; + + wvalue = p_str[off] | p_str[off + 1] << 8; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + add_precomputed_w5(r, wvalue, table); + } + + index -= kWindowSize; + + nistz384_point_double(r, r); + nistz384_point_double(r, r); + nistz384_point_double(r, r); + nistz384_point_double(r, r); + nistz384_point_double(r, r); + } + + /* Final window */ + wvalue = p_str[0]; + wvalue = (wvalue << 1) & kMask; + add_precomputed_w5(r, wvalue, table); +} + +void p384_point_double(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS]) +{ + P384_POINT t; + limbs_copy(t.X, a[0], P384_LIMBS); + limbs_copy(t.Y, a[1], P384_LIMBS); + limbs_copy(t.Z, a[2], P384_LIMBS); + nistz384_point_double(&t, &t); + limbs_copy(r[0], t.X, P384_LIMBS); + limbs_copy(r[1], t.Y, P384_LIMBS); + limbs_copy(r[2], t.Z, P384_LIMBS); +} + +void p384_point_add(Limb r[3][P384_LIMBS], + const Limb a[3][P384_LIMBS], + const Limb b[3][P384_LIMBS]) +{ + P384_POINT t1; + limbs_copy(t1.X, a[0], P384_LIMBS); + limbs_copy(t1.Y, a[1], P384_LIMBS); + limbs_copy(t1.Z, a[2], P384_LIMBS); + + P384_POINT t2; + limbs_copy(t2.X, b[0], P384_LIMBS); + limbs_copy(t2.Y, b[1], P384_LIMBS); + limbs_copy(t2.Z, b[2], P384_LIMBS); + + nistz384_point_add(&t1, &t1, &t2); + + limbs_copy(r[0], t1.X, P384_LIMBS); + limbs_copy(r[1], t1.Y, P384_LIMBS); + limbs_copy(r[2], t1.Z, P384_LIMBS); +} + +void p384_point_mul(Limb r[3][P384_LIMBS], const BN_ULONG p_scalar[P384_LIMBS], + const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) { + alignas(64) P384_POINT acc; + nistz384_point_mul(&acc, p_scalar, p_x, p_y); + limbs_copy(r[0], acc.X, P384_LIMBS); + limbs_copy(r[1], acc.Y, P384_LIMBS); + limbs_copy(r[2], acc.Z, P384_LIMBS); +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif diff --git a/ring-0.17.14/crypto/fipsmodule/ec/gfp_p256.c b/ring-0.17.14/crypto/fipsmodule/ec/gfp_p256.c new file mode 100644 index 0000000000..2aa0ae2ce2 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/gfp_p256.c @@ -0,0 +1,54 @@ +/* Copyright 2016 Brian Smith. 
+ * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "./p256_shared.h" + +#include "../../limbs/limbs.h" + +#if !defined(OPENSSL_USE_NISTZ256) + +typedef Limb ScalarMont[P256_LIMBS]; +typedef Limb Scalar[P256_LIMBS]; + +#include "../bn/internal.h" + +static const BN_ULONG N[P256_LIMBS] = { +#if defined(OPENSSL_64_BIT) + 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +#else + 0xfc632551, 0xf3b9cac2, 0xa7179e84, 0xbce6faad, 0xffffffff, 0xffffffff, 0, + 0xffffffff +#endif +}; + +static const BN_ULONG N_N0[] = { + BN_MONT_CTX_N0(0xccd1c8aa, 0xee00bc4f) +}; + +void p256_scalar_mul_mont(ScalarMont r, const ScalarMont a, + const ScalarMont b) { + /* XXX: Inefficient. TODO: optimize with dedicated multiplication routine. */ + bn_mul_mont_small(r, a, b, N, N_N0, P256_LIMBS); +} + +/* XXX: Inefficient. TODO: optimize with dedicated squaring routine. */ +void p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep) { + dev_assert_secret(rep >= 1); + p256_scalar_mul_mont(r, a, a); + for (Limb i = 1; i < rep; ++i) { + p256_scalar_mul_mont(r, r, r); + } +} + +#endif diff --git a/ring-0.17.14/crypto/fipsmodule/ec/gfp_p384.c b/ring-0.17.14/crypto/fipsmodule/ec/gfp_p384.c new file mode 100644 index 0000000000..f9a66faa2a --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/gfp_p384.c @@ -0,0 +1,246 @@ +/* Copyright 2016 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "../../limbs/limbs.h" + +#include "ecp_nistz384.h" +#include "../bn/internal.h" +#include "../../internal.h" + +#include "../../limbs/limbs.inl" + + /* XXX: Here we assume that the conversion from |Carry| to |Limb| is + * constant-time, but we haven't verified that assumption. TODO: Fix it so + * we don't need to make that assumption. 
*/ + + +typedef Limb Elem[P384_LIMBS]; +typedef Limb ScalarMont[P384_LIMBS]; +typedef Limb Scalar[P384_LIMBS]; + +static const BN_ULONG Q[P384_LIMBS] = { +#if defined(OPENSSL_64_BIT) + 0xffffffff, 0xffffffff00000000, 0xfffffffffffffffe, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff +#else + 0xffffffff, 0, 0, 0xffffffff, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff +#endif +}; + +static const BN_ULONG N[P384_LIMBS] = { +#if defined(OPENSSL_64_BIT) + 0xecec196accc52973, 0x581a0db248b0a77a, 0xc7634d81f4372ddf, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff +#else + 0xccc52973, 0xecec196a, 0x48b0a77a, 0x581a0db2, 0xf4372ddf, 0xc7634d81, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff +#endif +}; + +static const BN_ULONG ONE[P384_LIMBS] = { +#if defined(OPENSSL_64_BIT) + 0xffffffff00000001, 0xffffffff, 1, 0, 0 +#else + 1, 0xffffffff, 0xffffffff, 0, 1, 0, 0, 0, 0, 0 +#endif +}; + +static const Elem Q_PLUS_1_SHR_1 = { +#if defined(OPENSSL_64_BIT) + 0x80000000, 0x7fffffff80000000, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0x7fffffffffffffff +#else + 0x80000000, 0, 0x80000000, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff +#endif +}; + +static const BN_ULONG Q_N0[] = { + BN_MONT_CTX_N0(1, 1) +}; + +static const BN_ULONG N_N0[] = { + BN_MONT_CTX_N0(0x6ed46089, 0xe88fdc45) +}; + +/* XXX: MSVC for x86 warns when it fails to inline these functions it should + * probably inline. */ +#if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86) +#define INLINE_IF_POSSIBLE __forceinline +#else +#define INLINE_IF_POSSIBLE inline +#endif + +static inline Limb is_equal(const Elem a, const Elem b) { + return LIMBS_equal(a, b, P384_LIMBS); +} + +static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) { + return LIMBS_are_zero(a, P384_LIMBS); +} + +static inline void copy_conditional(Elem r, const Elem a, + const Limb condition) { + for (size_t i = 0; i < P384_LIMBS; ++i) { + r[i] = constant_time_select_w(condition, a[i], r[i]); + } +} + + +static inline void elem_add(Elem r, const Elem a, const Elem b) { + LIMBS_add_mod(r, a, b, Q, P384_LIMBS); +} + +static inline void elem_sub(Elem r, const Elem a, const Elem b) { + LIMBS_sub_mod(r, a, b, Q, P384_LIMBS); +} + +static void elem_div_by_2(Elem r, const Elem a) { + /* Consider the case where `a` is even. Then we can shift `a` right one bit + * and the result will still be valid because we didn't lose any bits and so + * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy. + * + * The remainder of this comment is considering the case where `a` is odd. + * + * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)` + * because the lowest bit is lost during the shift. For example, consider: + * + * ```python + * q = 2**384 - 2**128 - 2**96 + 2**32 - 1 + * a = 2**383 + * two_a = a * 2 % q + * assert two_a == 0x100000000ffffffffffffffff00000001 + * ``` + * + * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When + * we divide `two_a` by two (mod q), we need to get the value `2**383`, which + * we obviously can't get with just a right shift. + * + * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate + * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to + * keep track of an extra most significant bit. We can avoid that by instead + * calculating `(a >> 1) + ((q + 1) >> 1)`. 
The `1` in `q + 1` is the least + * significant bit of `a`. `q + 1` is even, which means it can be shifted + * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest + * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know + * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of + * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant + * bit of `a`, which is 1. Thus: + * + * sum = ((q + 1) >> 1) + (a >> 1) + * sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2) + * <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2) + * <= (q + 1)/2 + (q - 3)/2 (simplifying) + * <= (q + 1 + q - 3)/2 (factoring out the common divisor) + * <= (2q - 2)/2 (simplifying) + * <= q - 1 (simplifying) + * + * Thus, no reduction of the sum mod `q` is necessary. */ + + Limb is_odd = constant_time_is_nonzero_w(a[0] & 1); + + /* r = a >> 1. */ + Limb carry = a[P384_LIMBS - 1] & 1; + r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1; + for (size_t i = 1; i < P384_LIMBS; ++i) { + Limb new_carry = a[P384_LIMBS - i - 1]; + r[P384_LIMBS - i - 1] = + (a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1)); + carry = new_carry; + } + + Elem adjusted; + BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS); + dev_assert_secret(carry2 == 0); + (void)carry2; + copy_conditional(r, adjusted, is_odd); +} + +static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) { + /* XXX: Not (clearly) constant-time; inefficient.*/ + bn_mul_mont_small(r, a, b, Q, Q_N0, P384_LIMBS); +} + +static inline void elem_mul_by_2(Elem r, const Elem a) { + LIMBS_shl_mod(r, a, Q, P384_LIMBS); +} + +static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) { + /* XXX: inefficient. TODO: Replace with an integrated shift + add. */ + Elem doubled; + elem_add(doubled, a, a); + elem_add(r, doubled, a); +} + +static inline void elem_sqr_mont(Elem r, const Elem a) { + /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */ + elem_mul_mont(r, a, a); +} + +void p384_elem_sub(Elem r, const Elem a, const Elem b) { + elem_sub(r, a, b); +} + +void p384_elem_div_by_2(Elem r, const Elem a) { + elem_div_by_2(r, a); +} + +void p384_elem_mul_mont(Elem r, const Elem a, const Elem b) { + elem_mul_mont(r, a, b); +} + +void p384_elem_neg(Elem r, const Elem a) { + Limb is_zero = LIMBS_are_zero(a, P384_LIMBS); + Carry borrow = limbs_sub(r, Q, a, P384_LIMBS); + dev_assert_secret(borrow == 0); + (void)borrow; + for (size_t i = 0; i < P384_LIMBS; ++i) { + r[i] = constant_time_select_w(is_zero, 0, r[i]); + } +} + + +void p384_scalar_mul_mont(ScalarMont r, const ScalarMont a, + const ScalarMont b) { + /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */ + bn_mul_mont_small(r, a, b, N, N_N0, P384_LIMBS); +} + + +/* TODO(perf): Optimize this. */ + +static void p384_point_select_w5(P384_POINT *out, + const P384_POINT table[16], size_t index) { + Elem x; limbs_zero(x, P384_LIMBS); + Elem y; limbs_zero(y, P384_LIMBS); + Elem z; limbs_zero(z, P384_LIMBS); + + // TODO: Rewrite in terms of |limbs_select|. 
+ for (size_t i = 0; i < 16; ++i) { + crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1); + for (size_t j = 0; j < P384_LIMBS; ++j) { + x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); + y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); + z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]); + } + } + + limbs_copy(out->X, x, P384_LIMBS); + limbs_copy(out->Y, y, P384_LIMBS); + limbs_copy(out->Z, z, P384_LIMBS); +} + + +#include "ecp_nistz384.inl" diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz-table.h b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz-table.h new file mode 100644 index 0000000000..03bf782f48 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz-table.h @@ -0,0 +1,9502 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2015, Intel Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This is the precomputed constant time access table for the code in +// p256-nistz.c, for the default generator. The table consists of 37 +// subtables, each subtable contains 64 affine points. The affine points are +// encoded as eight uint64's, four for the x coordinate and four for the y. +// Both values are in little-endian order. There are 37 tables because a +// signed, 6-bit wNAF form of the scalar is used and ceil(256/(6 + 1)) = 37. +// Within each table there are 64 values because the 6-bit wNAF value can take +// 64 values, ignoring the sign bit, which is implemented by performing a +// negation of the affine point when required. We would like to align it to 2MB +// in order to increase the chances of using a large page but that appears to +// lead to invalid ELF files being produced. + +// This file is generated by make_tables.go. 
+ +static const alignas(4096) PRECOMP256_ROW ecp_nistz256_precomputed[37] = { + {{{TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), + TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6)}, + {TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), + TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85)}}, + {{TOBN(0x850046d4, 0x10ddd64d), TOBN(0xaa6ae3c1, 0xa433827d), + TOBN(0x73220503, 0x8d1490d9), TOBN(0xf6bb32e4, 0x3dcf3a3b)}, + {TOBN(0x2f3648d3, 0x61bee1a5), TOBN(0x152cd7cb, 0xeb236ff8), + TOBN(0x19a8fb0e, 0x92042dbe), TOBN(0x78c57751, 0x0a5b8a3b)}}, + {{TOBN(0xffac3f90, 0x4eebc127), TOBN(0xb027f84a, 0x087d81fb), + TOBN(0x66ad77dd, 0x87cbbc98), TOBN(0x26936a3f, 0xb6ff747e)}, + {TOBN(0xb04c5c1f, 0xc983a7eb), TOBN(0x583e47ad, 0x0861fe1a), + TOBN(0x78820831, 0x1a2ee98e), TOBN(0xd5f06a29, 0xe587cc07)}}, + {{TOBN(0x74b0b50d, 0x46918dcc), TOBN(0x4650a6ed, 0xc623c173), + TOBN(0x0cdaacac, 0xe8100af2), TOBN(0x577362f5, 0x41b0176b)}, + {TOBN(0x2d96f24c, 0xe4cbaba6), TOBN(0x17628471, 0xfad6f447), + TOBN(0x6b6c36de, 0xe5ddd22e), TOBN(0x84b14c39, 0x4c5ab863)}}, + {{TOBN(0xbe1b8aae, 0xc45c61f5), TOBN(0x90ec649a, 0x94b9537d), + TOBN(0x941cb5aa, 0xd076c20c), TOBN(0xc9079605, 0x890523c8)}, + {TOBN(0xeb309b4a, 0xe7ba4f10), TOBN(0x73c568ef, 0xe5eb882b), + TOBN(0x3540a987, 0x7e7a1f68), TOBN(0x73a076bb, 0x2dd1e916)}}, + {{TOBN(0x40394737, 0x3e77664a), TOBN(0x55ae744f, 0x346cee3e), + TOBN(0xd50a961a, 0x5b17a3ad), TOBN(0x13074b59, 0x54213673)}, + {TOBN(0x93d36220, 0xd377e44b), TOBN(0x299c2b53, 0xadff14b5), + TOBN(0xf424d44c, 0xef639f11), TOBN(0xa4c9916d, 0x4a07f75f)}}, + {{TOBN(0x0746354e, 0xa0173b4f), TOBN(0x2bd20213, 0xd23c00f7), + TOBN(0xf43eaab5, 0x0c23bb08), TOBN(0x13ba5119, 0xc3123e03)}, + {TOBN(0x2847d030, 0x3f5b9d4d), TOBN(0x6742f2f2, 0x5da67bdd), + TOBN(0xef933bdc, 0x77c94195), TOBN(0xeaedd915, 0x6e240867)}}, + {{TOBN(0x27f14cd1, 0x9499a78f), TOBN(0x462ab5c5, 0x6f9b3455), + TOBN(0x8f90f02a, 0xf02cfc6b), TOBN(0xb763891e, 0xb265230d)}, + {TOBN(0xf59da3a9, 0x532d4977), TOBN(0x21e3327d, 0xcf9eba15), + TOBN(0x123c7b84, 0xbe60bbf0), TOBN(0x56ec12f2, 0x7706df76)}}, + {{TOBN(0x75c96e8f, 0x264e20e8), TOBN(0xabe6bfed, 0x59a7a841), + TOBN(0x2cc09c04, 0x44c8eb00), TOBN(0xe05b3080, 0xf0c4e16b)}, + {TOBN(0x1eb7777a, 0xa45f3314), TOBN(0x56af7bed, 0xce5d45e3), + TOBN(0x2b6e019a, 0x88b12f1a), TOBN(0x086659cd, 0xfd835f9b)}}, + {{TOBN(0x2c18dbd1, 0x9dc21ec8), TOBN(0x98f9868a, 0x0fcf8139), + TOBN(0x737d2cd6, 0x48250b49), TOBN(0xcc61c947, 0x24b3428f)}, + {TOBN(0x0c2b4078, 0x80dd9e76), TOBN(0xc43a8991, 0x383fbe08), + TOBN(0x5f7d2d65, 0x779be5d2), TOBN(0x78719a54, 0xeb3b4ab5)}}, + {{TOBN(0xea7d260a, 0x6245e404), TOBN(0x9de40795, 0x6e7fdfe0), + TOBN(0x1ff3a415, 0x8dac1ab5), TOBN(0x3e7090f1, 0x649c9073)}, + {TOBN(0x1a768561, 0x2b944e88), TOBN(0x250f939e, 0xe57f61c8), + TOBN(0x0c0daa89, 0x1ead643d), TOBN(0x68930023, 0xe125b88e)}}, + {{TOBN(0x04b71aa7, 0xd2697768), TOBN(0xabdedef5, 0xca345a33), + TOBN(0x2409d29d, 0xee37385e), TOBN(0x4ee1df77, 0xcb83e156)}, + {TOBN(0x0cac12d9, 0x1cbb5b43), TOBN(0x170ed2f6, 0xca895637), + TOBN(0x28228cfa, 0x8ade6d66), TOBN(0x7ff57c95, 0x53238aca)}}, + {{TOBN(0xccc42563, 0x4b2ed709), TOBN(0x0e356769, 0x856fd30d), + TOBN(0xbcbcd43f, 0x559e9811), TOBN(0x738477ac, 0x5395b759)}, + {TOBN(0x35752b90, 0xc00ee17f), TOBN(0x68748390, 0x742ed2e3), + TOBN(0x7cd06422, 0xbd1f5bc1), TOBN(0xfbc08769, 0xc9e7b797)}}, + {{TOBN(0xa242a35b, 0xb0cf664a), TOBN(0x126e48f7, 0x7f9707e3), + TOBN(0x1717bf54, 0xc6832660), TOBN(0xfaae7332, 0xfd12c72e)}, + {TOBN(0x27b52db7, 0x995d586b), 
TOBN(0xbe29569e, 0x832237c2), + TOBN(0xe8e4193e, 0x2a65e7db), TOBN(0x152706dc, 0x2eaa1bbb)}}, + {{TOBN(0x72bcd8b7, 0xbc60055b), TOBN(0x03cc23ee, 0x56e27e4b), + TOBN(0xee337424, 0xe4819370), TOBN(0xe2aa0e43, 0x0ad3da09)}, + {TOBN(0x40b8524f, 0x6383c45d), TOBN(0xd7663554, 0x42a41b25), + TOBN(0x64efa6de, 0x778a4797), TOBN(0x2042170a, 0x7079adf4)}}, + {{TOBN(0x808b0b65, 0x0bc6fb80), TOBN(0x5882e075, 0x3ffe2e6b), + TOBN(0xd5ef2f7c, 0x2c83f549), TOBN(0x54d63c80, 0x9103b723)}, + {TOBN(0xf2f11bd6, 0x52a23f9b), TOBN(0x3670c319, 0x4b0b6587), + TOBN(0x55c4623b, 0xb1580e9e), TOBN(0x64edf7b2, 0x01efe220)}}, + {{TOBN(0x97091dcb, 0xd53c5c9d), TOBN(0xf17624b6, 0xac0a177b), + TOBN(0xb0f13975, 0x2cfe2dff), TOBN(0xc1a35c0a, 0x6c7a574e)}, + {TOBN(0x227d3146, 0x93e79987), TOBN(0x0575bf30, 0xe89cb80e), + TOBN(0x2f4e247f, 0x0d1883bb), TOBN(0xebd51226, 0x3274c3d0)}}, + {{TOBN(0x5f3e51c8, 0x56ada97a), TOBN(0x4afc964d, 0x8f8b403e), + TOBN(0xa6f247ab, 0x412e2979), TOBN(0x675abd1b, 0x6f80ebda)}, + {TOBN(0x66a2bd72, 0x5e485a1d), TOBN(0x4b2a5caf, 0x8f4f0b3c), + TOBN(0x2626927f, 0x1b847bba), TOBN(0x6c6fc7d9, 0x0502394d)}}, + {{TOBN(0xfea912ba, 0xa5659ae8), TOBN(0x68363aba, 0x25e1a16e), + TOBN(0xb8842277, 0x752c41ac), TOBN(0xfe545c28, 0x2897c3fc)}, + {TOBN(0x2d36e9e7, 0xdc4c696b), TOBN(0x5806244a, 0xfba977c5), + TOBN(0x85665e9b, 0xe39508c1), TOBN(0xf720ee25, 0x6d12597b)}}, + {{TOBN(0x8a979129, 0xd2337a31), TOBN(0x5916868f, 0x0f862bdc), + TOBN(0x048099d9, 0x5dd283ba), TOBN(0xe2d1eeb6, 0xfe5bfb4e)}, + {TOBN(0x82ef1c41, 0x7884005d), TOBN(0xa2d4ec17, 0xffffcbae), + TOBN(0x9161c53f, 0x8aa95e66), TOBN(0x5ee104e1, 0xc5fee0d0)}}, + {{TOBN(0x562e4cec, 0xc135b208), TOBN(0x74e1b265, 0x4783f47d), + TOBN(0x6d2a506c, 0x5a3f3b30), TOBN(0xecead9f4, 0xc16762fc)}, + {TOBN(0xf29dd4b2, 0xe286e5b9), TOBN(0x1b0fadc0, 0x83bb3c61), + TOBN(0x7a75023e, 0x7fac29a4), TOBN(0xc086d5f1, 0xc9477fa3)}}, + {{TOBN(0x0fc61135, 0x2f6f3076), TOBN(0xc99ffa23, 0xe3912a9a), + TOBN(0x6a0b0685, 0xd2f8ba3d), TOBN(0xfdc777e8, 0xe93358a4)}, + {TOBN(0x94a787bb, 0x35415f04), TOBN(0x640c2d6a, 0x4d23fea4), + TOBN(0x9de917da, 0x153a35b5), TOBN(0x793e8d07, 0x5d5cd074)}}, + {{TOBN(0xf4f87653, 0x2de45068), TOBN(0x37c7a7e8, 0x9e2e1f6e), + TOBN(0xd0825fa2, 0xa3584069), TOBN(0xaf2cea7c, 0x1727bf42)}, + {TOBN(0x0360a4fb, 0x9e4785a9), TOBN(0xe5fda49c, 0x27299f4a), + TOBN(0x48068e13, 0x71ac2f71), TOBN(0x83d0687b, 0x9077666f)}}, + {{TOBN(0x6d3883b2, 0x15d02819), TOBN(0x6d0d7550, 0x40dd9a35), + TOBN(0x61d7cbf9, 0x1d2b469f), TOBN(0xf97b232f, 0x2efc3115)}, + {TOBN(0xa551d750, 0xb24bcbc7), TOBN(0x11ea4949, 0x88a1e356), + TOBN(0x7669f031, 0x93cb7501), TOBN(0x595dc55e, 0xca737b8a)}}, + {{TOBN(0xa4a319ac, 0xd837879f), TOBN(0x6fc1b49e, 0xed6b67b0), + TOBN(0xe3959933, 0x32f1f3af), TOBN(0x966742eb, 0x65432a2e)}, + {TOBN(0x4b8dc9fe, 0xb4966228), TOBN(0x96cc6312, 0x43f43950), + TOBN(0x12068859, 0xc9b731ee), TOBN(0x7b948dc3, 0x56f79968)}}, + {{TOBN(0x61e4ad32, 0xed1f8008), TOBN(0xe6c9267a, 0xd8b17538), + TOBN(0x1ac7c5eb, 0x857ff6fb), TOBN(0x994baaa8, 0x55f2fb10)}, + {TOBN(0x84cf14e1, 0x1d248018), TOBN(0x5a39898b, 0x628ac508), + TOBN(0x14fde97b, 0x5fa944f5), TOBN(0xed178030, 0xd12e5ac7)}}, + {{TOBN(0x042c2af4, 0x97e2feb4), TOBN(0xd36a42d7, 0xaebf7313), + TOBN(0x49d2c9eb, 0x084ffdd7), TOBN(0x9f8aa54b, 0x2ef7c76a)}, + {TOBN(0x9200b7ba, 0x09895e70), TOBN(0x3bd0c66f, 0xddb7fb58), + TOBN(0x2d97d108, 0x78eb4cbb), TOBN(0x2d431068, 0xd84bde31)}}, + {{TOBN(0x4b523eb7, 0x172ccd1f), TOBN(0x7323cb28, 0x30a6a892), + TOBN(0x97082ec0, 0xcfe153eb), TOBN(0xe97f6b6a, 0xf2aadb97)}, + {TOBN(0x1d3d393e, 
0xd1a83da1), TOBN(0xa6a7f9c7, 0x804b2a68), + TOBN(0x4a688b48, 0x2d0cb71e), TOBN(0xa9b4cc5f, 0x40585278)}}, + {{TOBN(0x5e5db46a, 0xcb66e132), TOBN(0xf1be963a, 0x0d925880), + TOBN(0x944a7027, 0x0317b9e2), TOBN(0xe266f959, 0x48603d48)}, + {TOBN(0x98db6673, 0x5c208899), TOBN(0x90472447, 0xa2fb18a3), + TOBN(0x8a966939, 0x777c619f), TOBN(0x3798142a, 0x2a3be21b)}}, + {{TOBN(0xb4241cb1, 0x3298b343), TOBN(0xa3a14e49, 0xb44f65a1), + TOBN(0xc5f4d6cd, 0x3ac77acd), TOBN(0xd0288cb5, 0x52b6fc3c)}, + {TOBN(0xd5cc8c2f, 0x1c040abc), TOBN(0xb675511e, 0x06bf9b4a), + TOBN(0xd667da37, 0x9b3aa441), TOBN(0x460d45ce, 0x51601f72)}}, + {{TOBN(0xe2f73c69, 0x6755ff89), TOBN(0xdd3cf7e7, 0x473017e6), + TOBN(0x8ef5689d, 0x3cf7600d), TOBN(0x948dc4f8, 0xb1fc87b4)}, + {TOBN(0xd9e9fe81, 0x4ea53299), TOBN(0x2d921ca2, 0x98eb6028), + TOBN(0xfaecedfd, 0x0c9803fc), TOBN(0xf38ae891, 0x4d7b4745)}}, + {{TOBN(0xd8c5fccf, 0xc5e3a3d8), TOBN(0xbefd904c, 0x4079dfbf), + TOBN(0xbc6d6a58, 0xfead0197), TOBN(0x39227077, 0x695532a4)}, + {TOBN(0x09e23e6d, 0xdbef42f5), TOBN(0x7e449b64, 0x480a9908), + TOBN(0x7b969c1a, 0xad9a2e40), TOBN(0x6231d792, 0x9591c2a4)}}, + {{TOBN(0x87151456, 0x0f664534), TOBN(0x85ceae7c, 0x4b68f103), + TOBN(0xac09c4ae, 0x65578ab9), TOBN(0x33ec6868, 0xf044b10c)}, + {TOBN(0x6ac4832b, 0x3a8ec1f1), TOBN(0x5509d128, 0x5847d5ef), + TOBN(0xf909604f, 0x763f1574), TOBN(0xb16c4303, 0xc32f63c4)}}, + {{TOBN(0xb6ab2014, 0x7ca23cd3), TOBN(0xcaa7a5c6, 0xa391849d), + TOBN(0x5b0673a3, 0x75678d94), TOBN(0xc982ddd4, 0xdd303e64)}, + {TOBN(0xfd7b000b, 0x5db6f971), TOBN(0xbba2cb1f, 0x6f876f92), + TOBN(0xc77332a3, 0x3c569426), TOBN(0xa159100c, 0x570d74f8)}}, + {{TOBN(0xfd16847f, 0xdec67ef5), TOBN(0x742ee464, 0x233e76b7), + TOBN(0x0b8e4134, 0xefc2b4c8), TOBN(0xca640b86, 0x42a3e521)}, + {TOBN(0x653a0190, 0x8ceb6aa9), TOBN(0x313c300c, 0x547852d5), + TOBN(0x24e4ab12, 0x6b237af7), TOBN(0x2ba90162, 0x8bb47af8)}}, + {{TOBN(0x3d5e58d6, 0xa8219bb7), TOBN(0xc691d0bd, 0x1b06c57f), + TOBN(0x0ae4cb10, 0xd257576e), TOBN(0x3569656c, 0xd54a3dc3)}, + {TOBN(0xe5ebaebd, 0x94cda03a), TOBN(0x934e82d3, 0x162bfe13), + TOBN(0x450ac0ba, 0xe251a0c6), TOBN(0x480b9e11, 0xdd6da526)}}, + {{TOBN(0x00467bc5, 0x8cce08b5), TOBN(0xb636458c, 0x7f178d55), + TOBN(0xc5748bae, 0xa677d806), TOBN(0x2763a387, 0xdfa394eb)}, + {TOBN(0xa12b448a, 0x7d3cebb6), TOBN(0xe7adda3e, 0x6f20d850), + TOBN(0xf63ebce5, 0x1558462c), TOBN(0x58b36143, 0x620088a8)}}, + {{TOBN(0x8a2cc3ca, 0x4d63c0ee), TOBN(0x51233117, 0x0fe948ce), + TOBN(0x7463fd85, 0x222ef33b), TOBN(0xadf0c7dc, 0x7c603d6c)}, + {TOBN(0x0ec32d3b, 0xfe7765e5), TOBN(0xccaab359, 0xbf380409), + TOBN(0xbdaa84d6, 0x8e59319c), TOBN(0xd9a4c280, 0x9c80c34d)}}, + {{TOBN(0xa9d89488, 0xa059c142), TOBN(0x6f5ae714, 0xff0b9346), + TOBN(0x068f237d, 0x16fb3664), TOBN(0x5853e4c4, 0x363186ac)}, + {TOBN(0xe2d87d23, 0x63c52f98), TOBN(0x2ec4a766, 0x81828876), + TOBN(0x47b864fa, 0xe14e7b1c), TOBN(0x0c0bc0e5, 0x69192408)}}, + {{TOBN(0xe4d7681d, 0xb82e9f3e), TOBN(0x83200f0b, 0xdf25e13c), + TOBN(0x8909984c, 0x66f27280), TOBN(0x462d7b00, 0x75f73227)}, + {TOBN(0xd90ba188, 0xf2651798), TOBN(0x74c6e18c, 0x36ab1c34), + TOBN(0xab256ea3, 0x5ef54359), TOBN(0x03466612, 0xd1aa702f)}}, + {{TOBN(0x624d6049, 0x2ed22e91), TOBN(0x6fdfe0b5, 0x6f072822), + TOBN(0xeeca1115, 0x39ce2271), TOBN(0x98100a4f, 0xdb01614f)}, + {TOBN(0xb6b0daa2, 0xa35c628f), TOBN(0xb6f94d2e, 0xc87e9a47), + TOBN(0xc6773259, 0x1d57d9ce), TOBN(0xf70bfeec, 0x03884a7b)}}, + {{TOBN(0x5fb35ccf, 0xed2bad01), TOBN(0xa155cbe3, 0x1da6a5c7), + TOBN(0xc2e2594c, 0x30a92f8f), TOBN(0x649c89ce, 0x5bfafe43)}, + 
{TOBN(0xd158667d, 0xe9ff257a), TOBN(0x9b359611, 0xf32c50ae), + TOBN(0x4b00b20b, 0x906014cf), TOBN(0xf3a8cfe3, 0x89bc7d3d)}}, + {{TOBN(0x4ff23ffd, 0x248a7d06), TOBN(0x80c5bfb4, 0x878873fa), + TOBN(0xb7d9ad90, 0x05745981), TOBN(0x179c85db, 0x3db01994)}, + {TOBN(0xba41b062, 0x61a6966c), TOBN(0x4d82d052, 0xeadce5a8), + TOBN(0x9e91cd3b, 0xa5e6a318), TOBN(0x47795f4f, 0x95b2dda0)}}, + {{TOBN(0xecfd7c1f, 0xd55a897c), TOBN(0x009194ab, 0xb29110fb), + TOBN(0x5f0e2046, 0xe381d3b0), TOBN(0x5f3425f6, 0xa98dd291)}, + {TOBN(0xbfa06687, 0x730d50da), TOBN(0x0423446c, 0x4b083b7f), + TOBN(0x397a247d, 0xd69d3417), TOBN(0xeb629f90, 0x387ba42a)}}, + {{TOBN(0x1ee426cc, 0xd5cd79bf), TOBN(0x0032940b, 0x946c6e18), + TOBN(0x1b1e8ae0, 0x57477f58), TOBN(0xe94f7d34, 0x6d823278)}, + {TOBN(0xc747cb96, 0x782ba21a), TOBN(0xc5254469, 0xf72b33a5), + TOBN(0x772ef6de, 0xc7f80c81), TOBN(0xd73acbfe, 0x2cd9e6b5)}}, + {{TOBN(0x4075b5b1, 0x49ee90d9), TOBN(0x785c339a, 0xa06e9eba), + TOBN(0xa1030d5b, 0xabf825e0), TOBN(0xcec684c3, 0xa42931dc)}, + {TOBN(0x42ab62c9, 0xc1586e63), TOBN(0x45431d66, 0x5ab43f2b), + TOBN(0x57c8b2c0, 0x55f7835d), TOBN(0x033da338, 0xc1b7f865)}}, + {{TOBN(0x283c7513, 0xcaa76097), TOBN(0x0a624fa9, 0x36c83906), + TOBN(0x6b20afec, 0x715af2c7), TOBN(0x4b969974, 0xeba78bfd)}, + {TOBN(0x220755cc, 0xd921d60e), TOBN(0x9b944e10, 0x7baeca13), + TOBN(0x04819d51, 0x5ded93d4), TOBN(0x9bbff86e, 0x6dddfd27)}}, + {{TOBN(0x6b344130, 0x77adc612), TOBN(0xa7496529, 0xbbd803a0), + TOBN(0x1a1baaa7, 0x6d8805bd), TOBN(0xc8403902, 0x470343ad)}, + {TOBN(0x39f59f66, 0x175adff1), TOBN(0x0b26d7fb, 0xb7d8c5b7), + TOBN(0xa875f5ce, 0x529d75e3), TOBN(0x85efc7e9, 0x41325cc2)}}, + {{TOBN(0x21950b42, 0x1ff6acd3), TOBN(0xffe70484, 0x53dc6909), + TOBN(0xff4cd0b2, 0x28766127), TOBN(0xabdbe608, 0x4fb7db2b)}, + {TOBN(0x837c9228, 0x5e1109e8), TOBN(0x26147d27, 0xf4645b5a), + TOBN(0x4d78f592, 0xf7818ed8), TOBN(0xd394077e, 0xf247fa36)}}, + {{TOBN(0x0fb9c2d0, 0x488c171a), TOBN(0xa78bfbaa, 0x13685278), + TOBN(0xedfbe268, 0xd5b1fa6a), TOBN(0x0dceb8db, 0x2b7eaba7)}, + {TOBN(0xbf9e8089, 0x9ae2b710), TOBN(0xefde7ae6, 0xa4449c96), + TOBN(0x43b7716b, 0xcc143a46), TOBN(0xd7d34194, 0xc3628c13)}}, + {{TOBN(0x508cec1c, 0x3b3f64c9), TOBN(0xe20bc0ba, 0x1e5edf3f), + TOBN(0xda1deb85, 0x2f4318d4), TOBN(0xd20ebe0d, 0x5c3fa443)}, + {TOBN(0x370b4ea7, 0x73241ea3), TOBN(0x61f1511c, 0x5e1a5f65), + TOBN(0x99a5e23d, 0x82681c62), TOBN(0xd731e383, 0xa2f54c2d)}}, + {{TOBN(0x2692f36e, 0x83445904), TOBN(0x2e0ec469, 0xaf45f9c0), + TOBN(0x905a3201, 0xc67528b7), TOBN(0x88f77f34, 0xd0e5e542)}, + {TOBN(0xf67a8d29, 0x5864687c), TOBN(0x23b92eae, 0x22df3562), + TOBN(0x5c27014b, 0x9bbec39e), TOBN(0x7ef2f226, 0x9c0f0f8d)}}, + {{TOBN(0x97359638, 0x546c4d8d), TOBN(0x5f9c3fc4, 0x92f24679), + TOBN(0x912e8bed, 0xa8c8acd9), TOBN(0xec3a318d, 0x306634b0)}, + {TOBN(0x80167f41, 0xc31cb264), TOBN(0x3db82f6f, 0x522113f2), + TOBN(0xb155bcd2, 0xdcafe197), TOBN(0xfba1da59, 0x43465283)}}, + {{TOBN(0xa0425b8e, 0xb212cf53), TOBN(0x4f2e512e, 0xf8557c5f), + TOBN(0xc1286ff9, 0x25c4d56c), TOBN(0xbb8a0fea, 0xee26c851)}, + {TOBN(0xc28f70d2, 0xe7d6107e), TOBN(0x7ee0c444, 0xe76265aa), + TOBN(0x3df277a4, 0x1d1936b1), TOBN(0x1a556e3f, 0xea9595eb)}}, + {{TOBN(0x258bbbf9, 0xe7305683), TOBN(0x31eea5bf, 0x07ef5be6), + TOBN(0x0deb0e4a, 0x46c814c1), TOBN(0x5cee8449, 0xa7b730dd)}, + {TOBN(0xeab495c5, 0xa0182bde), TOBN(0xee759f87, 0x9e27a6b4), + TOBN(0xc2cf6a68, 0x80e518ca), TOBN(0x25e8013f, 0xf14cf3f4)}}, + {{TOBN(0x8fc44140, 0x7e8d7a14), TOBN(0xbb1ff3ca, 0x9556f36a), + TOBN(0x6a844385, 0x14600044), TOBN(0xba3f0c4a, 0x7451ae63)}, 
+ {TOBN(0xdfcac25b, 0x1f9af32a), TOBN(0x01e0db86, 0xb1f2214b), + TOBN(0x4e9a5bc2, 0xa4b596ac), TOBN(0x83927681, 0x026c2c08)}}, + {{TOBN(0x3ec832e7, 0x7acaca28), TOBN(0x1bfeea57, 0xc7385b29), + TOBN(0x068212e3, 0xfd1eaf38), TOBN(0xc1329830, 0x6acf8ccc)}, + {TOBN(0xb909f2db, 0x2aac9e59), TOBN(0x5748060d, 0xb661782a), + TOBN(0xc5ab2632, 0xc79b7a01), TOBN(0xda44c6c6, 0x00017626)}}, + {{TOBN(0xf26c00e8, 0xa7ea82f0), TOBN(0x99cac80d, 0xe4299aaf), + TOBN(0xd66fe3b6, 0x7ed78be1), TOBN(0x305f725f, 0x648d02cd)}, + {TOBN(0x33ed1bc4, 0x623fb21b), TOBN(0xfa70533e, 0x7a6319ad), + TOBN(0x17ab562d, 0xbe5ffb3e), TOBN(0x06374994, 0x56674741)}}, + {{TOBN(0x69d44ed6, 0x5c46aa8e), TOBN(0x2100d5d3, 0xa8d063d1), + TOBN(0xcb9727ea, 0xa2d17c36), TOBN(0x4c2bab1b, 0x8add53b7)}, + {TOBN(0xa084e90c, 0x15426704), TOBN(0x778afcd3, 0xa837ebea), + TOBN(0x6651f701, 0x7ce477f8), TOBN(0xa0624998, 0x46fb7a8b)}}, + {{TOBN(0xdc1e6828, 0xed8a6e19), TOBN(0x33fc2336, 0x4189d9c7), + TOBN(0x026f8fe2, 0x671c39bc), TOBN(0xd40c4ccd, 0xbc6f9915)}, + {TOBN(0xafa135bb, 0xf80e75ca), TOBN(0x12c651a0, 0x22adff2c), + TOBN(0xc40a04bd, 0x4f51ad96), TOBN(0x04820109, 0xbbe4e832)}}, + {{TOBN(0x3667eb1a, 0x7f4c04cc), TOBN(0x59556621, 0xa9404f84), + TOBN(0x71cdf653, 0x7eceb50a), TOBN(0x994a44a6, 0x9b8335fa)}, + {TOBN(0xd7faf819, 0xdbeb9b69), TOBN(0x473c5680, 0xeed4350d), + TOBN(0xb6658466, 0xda44bba2), TOBN(0x0d1bc780, 0x872bdbf3)}}, + {{TOBN(0xe535f175, 0xa1962f91), TOBN(0x6ed7e061, 0xed58f5a7), + TOBN(0x177aa4c0, 0x2089a233), TOBN(0x0dbcb03a, 0xe539b413)}, + {TOBN(0xe3dc424e, 0xbb32e38e), TOBN(0x6472e5ef, 0x6806701e), + TOBN(0xdd47ff98, 0x814be9ee), TOBN(0x6b60cfff, 0x35ace009)}}, + {{TOBN(0xb8d3d931, 0x9ff91fe5), TOBN(0x039c4800, 0xf0518eed), + TOBN(0x95c37632, 0x9182cb26), TOBN(0x0763a434, 0x82fc568d)}, + {TOBN(0x707c04d5, 0x383e76ba), TOBN(0xac98b930, 0x824e8197), + TOBN(0x92bf7c8f, 0x91230de0), TOBN(0x90876a01, 0x40959b70)}}, + {{TOBN(0xdb6d96f3, 0x05968b80), TOBN(0x380a0913, 0x089f73b9), + TOBN(0x7da70b83, 0xc2c61e01), TOBN(0x95fb8394, 0x569b38c7)}, + {TOBN(0x9a3c6512, 0x80edfe2f), TOBN(0x8f726bb9, 0x8faeaf82), + TOBN(0x8010a4a0, 0x78424bf8), TOBN(0x29672044, 0x0e844970)}}}, + {{{TOBN(0x63c5cb81, 0x7a2ad62a), TOBN(0x7ef2b6b9, 0xac62ff54), + TOBN(0x3749bba4, 0xb3ad9db5), TOBN(0xad311f2c, 0x46d5a617)}, + {TOBN(0xb77a8087, 0xc2ff3b6d), TOBN(0xb46feaf3, 0x367834ff), + TOBN(0xf8aa266d, 0x75d6b138), TOBN(0xfa38d320, 0xec008188)}}, + {{TOBN(0x486d8ffa, 0x696946fc), TOBN(0x50fbc6d8, 0xb9cba56d), + TOBN(0x7e3d423e, 0x90f35a15), TOBN(0x7c3da195, 0xc0dd962c)}, + {TOBN(0xe673fdb0, 0x3cfd5d8b), TOBN(0x0704b7c2, 0x889dfca5), + TOBN(0xf6ce581f, 0xf52305aa), TOBN(0x399d49eb, 0x914d5e53)}}, + {{TOBN(0x380a496d, 0x6ec293cd), TOBN(0x733dbda7, 0x8e7051f5), + TOBN(0x037e388d, 0xb849140a), TOBN(0xee4b32b0, 0x5946dbf6)}, + {TOBN(0xb1c4fda9, 0xcae368d1), TOBN(0x5001a7b0, 0xfdb0b2f3), + TOBN(0x6df59374, 0x2e3ac46e), TOBN(0x4af675f2, 0x39b3e656)}}, + {{TOBN(0x44e38110, 0x39949296), TOBN(0x5b63827b, 0x361db1b5), + TOBN(0x3e5323ed, 0x206eaff5), TOBN(0x942370d2, 0xc21f4290)}, + {TOBN(0xf2caaf2e, 0xe0d985a1), TOBN(0x192cc64b, 0x7239846d), + TOBN(0x7c0b8f47, 0xae6312f8), TOBN(0x7dc61f91, 0x96620108)}}, + {{TOBN(0xb830fb5b, 0xc2da7de9), TOBN(0xd0e643df, 0x0ff8d3be), + TOBN(0x31ee77ba, 0x188a9641), TOBN(0x4e8aa3aa, 0xbcf6d502)}, + {TOBN(0xf9fb6532, 0x9a49110f), TOBN(0xd18317f6, 0x2dd6b220), + TOBN(0x7e3ced41, 0x52c3ea5a), TOBN(0x0d296a14, 0x7d579c4a)}}, + {{TOBN(0x35d6a53e, 0xed4c3717), TOBN(0x9f8240cf, 0x3d0ed2a3), + TOBN(0x8c0d4d05, 0xe5543aa5), TOBN(0x45d5bbfb, 
0xdd33b4b4)}, + {TOBN(0xfa04cc73, 0x137fd28e), TOBN(0x862ac6ef, 0xc73b3ffd), + TOBN(0x403ff9f5, 0x31f51ef2), TOBN(0x34d5e0fc, 0xbc73f5a2)}}, + {{TOBN(0xf2526820, 0x08913f4f), TOBN(0xea20ed61, 0xeac93d95), + TOBN(0x51ed38b4, 0x6ca6b26c), TOBN(0x8662dcbc, 0xea4327b0)}, + {TOBN(0x6daf295c, 0x725d2aaa), TOBN(0xbad2752f, 0x8e52dcda), + TOBN(0x2210e721, 0x0b17dacc), TOBN(0xa37f7912, 0xd51e8232)}}, + {{TOBN(0x4f7081e1, 0x44cc3add), TOBN(0xd5ffa1d6, 0x87be82cf), + TOBN(0x89890b6c, 0x0edd6472), TOBN(0xada26e1a, 0x3ed17863)}, + {TOBN(0x276f2715, 0x63483caa), TOBN(0xe6924cd9, 0x2f6077fd), + TOBN(0x05a7fe98, 0x0a466e3c), TOBN(0xf1c794b0, 0xb1902d1f)}}, + {{TOBN(0xe5213688, 0x82a8042c), TOBN(0xd931cfaf, 0xcd278298), + TOBN(0x069a0ae0, 0xf597a740), TOBN(0x0adbb3f3, 0xeb59107c)}, + {TOBN(0x983e951e, 0x5eaa8eb8), TOBN(0xe663a8b5, 0x11b48e78), + TOBN(0x1631cc0d, 0x8a03f2c5), TOBN(0x7577c11e, 0x11e271e2)}}, + {{TOBN(0x33b2385c, 0x08369a90), TOBN(0x2990c59b, 0x190eb4f8), + TOBN(0x819a6145, 0xc68eac80), TOBN(0x7a786d62, 0x2ec4a014)}, + {TOBN(0x33faadbe, 0x20ac3a8d), TOBN(0x31a21781, 0x5aba2d30), + TOBN(0x209d2742, 0xdba4f565), TOBN(0xdb2ce9e3, 0x55aa0fbb)}}, + {{TOBN(0x8cef334b, 0x168984df), TOBN(0xe81dce17, 0x33879638), + TOBN(0xf6e6949c, 0x263720f0), TOBN(0x5c56feaf, 0xf593cbec)}, + {TOBN(0x8bff5601, 0xfde58c84), TOBN(0x74e24117, 0x2eccb314), + TOBN(0xbcf01b61, 0x4c9a8a78), TOBN(0xa233e35e, 0x544c9868)}}, + {{TOBN(0xb3156bf3, 0x8bd7aff1), TOBN(0x1b5ee4cb, 0x1d81b146), + TOBN(0x7ba1ac41, 0xd628a915), TOBN(0x8f3a8f9c, 0xfd89699e)}, + {TOBN(0x7329b9c9, 0xa0748be7), TOBN(0x1d391c95, 0xa92e621f), + TOBN(0xe51e6b21, 0x4d10a837), TOBN(0xd255f53a, 0x4947b435)}}, + {{TOBN(0x07669e04, 0xf1788ee3), TOBN(0xc14f27af, 0xa86938a2), + TOBN(0x8b47a334, 0xe93a01c0), TOBN(0xff627438, 0xd9366808)}, + {TOBN(0x7a0985d8, 0xca2a5965), TOBN(0x3d9a5542, 0xd6e9b9b3), + TOBN(0xc23eb80b, 0x4cf972e8), TOBN(0x5c1c33bb, 0x4fdf72fd)}}, + {{TOBN(0x0c4a58d4, 0x74a86108), TOBN(0xf8048a8f, 0xee4c5d90), + TOBN(0xe3c7c924, 0xe86d4c80), TOBN(0x28c889de, 0x056a1e60)}, + {TOBN(0x57e2662e, 0xb214a040), TOBN(0xe8c48e98, 0x37e10347), + TOBN(0x87742862, 0x80ac748a), TOBN(0xf1c24022, 0x186b06f2)}}, + {{TOBN(0xac2dd4c3, 0x5f74040a), TOBN(0x409aeb71, 0xfceac957), + TOBN(0x4fbad782, 0x55c4ec23), TOBN(0xb359ed61, 0x8a7b76ec)}, + {TOBN(0x12744926, 0xed6f4a60), TOBN(0xe21e8d7f, 0x4b912de3), + TOBN(0xe2575a59, 0xfc705a59), TOBN(0x72f1d4de, 0xed2dbc0e)}}, + {{TOBN(0x3d2b24b9, 0xeb7926b8), TOBN(0xbff88cb3, 0xcdbe5509), + TOBN(0xd0f399af, 0xe4dd640b), TOBN(0x3c5fe130, 0x2f76ed45)}, + {TOBN(0x6f3562f4, 0x3764fb3d), TOBN(0x7b5af318, 0x3151b62d), + TOBN(0xd5bd0bc7, 0xd79ce5f3), TOBN(0xfdaf6b20, 0xec66890f)}}, + {{TOBN(0x735c67ec, 0x6063540c), TOBN(0x50b259c2, 0xe5f9cb8f), + TOBN(0xb8734f9a, 0x3f99c6ab), TOBN(0xf8cc13d5, 0xa3a7bc85)}, + {TOBN(0x80c1b305, 0xc5217659), TOBN(0xfe5364d4, 0x4ec12a54), + TOBN(0xbd87045e, 0x681345fe), TOBN(0x7f8efeb1, 0x582f897f)}}, + {{TOBN(0xe8cbf1e5, 0xd5923359), TOBN(0xdb0cea9d, 0x539b9fb0), + TOBN(0x0c5b34cf, 0x49859b98), TOBN(0x5e583c56, 0xa4403cc6)}, + {TOBN(0x11fc1a2d, 0xd48185b7), TOBN(0xc93fbc7e, 0x6e521787), + TOBN(0x47e7a058, 0x05105b8b), TOBN(0x7b4d4d58, 0xdb8260c8)}}, + {{TOBN(0xe33930b0, 0x46eb842a), TOBN(0x8e844a9a, 0x7bdae56d), + TOBN(0x34ef3a9e, 0x13f7fdfc), TOBN(0xb3768f82, 0x636ca176)}, + {TOBN(0x2821f4e0, 0x4e09e61c), TOBN(0x414dc3a1, 0xa0c7cddc), + TOBN(0xd5379437, 0x54945fcd), TOBN(0x151b6eef, 0xb3555ff1)}}, + {{TOBN(0xb31bd613, 0x6339c083), TOBN(0x39ff8155, 0xdfb64701), + TOBN(0x7c3388d2, 0xe29604ab), 
TOBN(0x1e19084b, 0xa6b10442)}, + {TOBN(0x17cf54c0, 0xeccd47ef), TOBN(0x89693385, 0x4a5dfb30), + TOBN(0x69d023fb, 0x47daf9f6), TOBN(0x9222840b, 0x7d91d959)}}, + {{TOBN(0x439108f5, 0x803bac62), TOBN(0x0b7dd91d, 0x379bd45f), + TOBN(0xd651e827, 0xca63c581), TOBN(0x5c5d75f6, 0x509c104f)}, + {TOBN(0x7d5fc738, 0x1f2dc308), TOBN(0x20faa7bf, 0xd98454be), + TOBN(0x95374bee, 0xa517b031), TOBN(0xf036b9b1, 0x642692ac)}}, + {{TOBN(0xc5106109, 0x39842194), TOBN(0xb7e2353e, 0x49d05295), + TOBN(0xfc8c1d5c, 0xefb42ee0), TOBN(0xe04884eb, 0x08ce811c)}, + {TOBN(0xf1f75d81, 0x7419f40e), TOBN(0x5b0ac162, 0xa995c241), + TOBN(0x120921bb, 0xc4c55646), TOBN(0x713520c2, 0x8d33cf97)}}, + {{TOBN(0xb4a65a5c, 0xe98c5100), TOBN(0x6cec871d, 0x2ddd0f5a), + TOBN(0x251f0b7f, 0x9ba2e78b), TOBN(0x224a8434, 0xce3a2a5f)}, + {TOBN(0x26827f61, 0x25f5c46f), TOBN(0x6a22bedc, 0x48545ec0), + TOBN(0x25ae5fa0, 0xb1bb5cdc), TOBN(0xd693682f, 0xfcb9b98f)}}, + {{TOBN(0x32027fe8, 0x91e5d7d3), TOBN(0xf14b7d17, 0x73a07678), + TOBN(0xf88497b3, 0xc0dfdd61), TOBN(0xf7c2eec0, 0x2a8c4f48)}, + {TOBN(0xaa5573f4, 0x3756e621), TOBN(0xc013a240, 0x1825b948), + TOBN(0x1c03b345, 0x63878572), TOBN(0xa0472bea, 0x653a4184)}}, + {{TOBN(0xf4222e27, 0x0ac69a80), TOBN(0x34096d25, 0xf51e54f6), + TOBN(0x00a648cb, 0x8fffa591), TOBN(0x4e87acdc, 0x69b6527f)}, + {TOBN(0x0575e037, 0xe285ccb4), TOBN(0x188089e4, 0x50ddcf52), + TOBN(0xaa96c9a8, 0x870ff719), TOBN(0x74a56cd8, 0x1fc7e369)}}, + {{TOBN(0x41d04ee2, 0x1726931a), TOBN(0x0bbbb2c8, 0x3660ecfd), + TOBN(0xa6ef6de5, 0x24818e18), TOBN(0xe421cc51, 0xe7d57887)}, + {TOBN(0xf127d208, 0xbea87be6), TOBN(0x16a475d3, 0xb1cdd682), + TOBN(0x9db1b684, 0x439b63f7), TOBN(0x5359b3db, 0xf0f113b6)}}, + {{TOBN(0xdfccf1de, 0x8bf06e31), TOBN(0x1fdf8f44, 0xdd383901), + TOBN(0x10775cad, 0x5017e7d2), TOBN(0xdfc3a597, 0x58d11eef)}, + {TOBN(0x6ec9c8a0, 0xb1ecff10), TOBN(0xee6ed6cc, 0x28400549), + TOBN(0xb5ad7bae, 0x1b4f8d73), TOBN(0x61b4f11d, 0xe00aaab9)}}, + {{TOBN(0x7b32d69b, 0xd4eff2d7), TOBN(0x88ae6771, 0x4288b60f), + TOBN(0x159461b4, 0x37a1e723), TOBN(0x1f3d4789, 0x570aae8c)}, + {TOBN(0x869118c0, 0x7f9871da), TOBN(0x35fbda78, 0xf635e278), + TOBN(0x738f3641, 0xe1541dac), TOBN(0x6794b13a, 0xc0dae45f)}}, + {{TOBN(0x065064ac, 0x09cc0917), TOBN(0x27c53729, 0xc68540fd), + TOBN(0x0d2d4c8e, 0xef227671), TOBN(0xd23a9f80, 0xa1785a04)}, + {TOBN(0x98c59528, 0x52650359), TOBN(0xfa09ad01, 0x74a1acad), + TOBN(0x082d5a29, 0x0b55bf5c), TOBN(0xa40f1c67, 0x419b8084)}}, + {{TOBN(0x3a5c752e, 0xdcc18770), TOBN(0x4baf1f2f, 0x8825c3a5), + TOBN(0xebd63f74, 0x21b153ed), TOBN(0xa2383e47, 0xb2f64723)}, + {TOBN(0xe7bf620a, 0x2646d19a), TOBN(0x56cb44ec, 0x03c83ffd), + TOBN(0xaf7267c9, 0x4f6be9f1), TOBN(0x8b2dfd7b, 0xc06bb5e9)}}, + {{TOBN(0xb87072f2, 0xa672c5c7), TOBN(0xeacb11c8, 0x0d53c5e2), + TOBN(0x22dac29d, 0xff435932), TOBN(0x37bdb99d, 0x4408693c)}, + {TOBN(0xf6e62fb6, 0x2899c20f), TOBN(0x3535d512, 0x447ece24), + TOBN(0xfbdc6b88, 0xff577ce3), TOBN(0x726693bd, 0x190575f2)}}, + {{TOBN(0x6772b0e5, 0xab4b35a2), TOBN(0x1d8b6001, 0xf5eeaacf), + TOBN(0x728f7ce4, 0x795b9580), TOBN(0x4a20ed2a, 0x41fb81da)}, + {TOBN(0x9f685cd4, 0x4fec01e6), TOBN(0x3ed7ddcc, 0xa7ff50ad), + TOBN(0x460fd264, 0x0c2d97fd), TOBN(0x3a241426, 0xeb82f4f9)}}, + {{TOBN(0x17d1df2c, 0x6a8ea820), TOBN(0xb2b50d3b, 0xf22cc254), + TOBN(0x03856cba, 0xb7291426), TOBN(0x87fd26ae, 0x04f5ee39)}, + {TOBN(0x9cb696cc, 0x02bee4ba), TOBN(0x53121804, 0x06820fd6), + TOBN(0xa5dfc269, 0x0212e985), TOBN(0x666f7ffa, 0x160f9a09)}}, + {{TOBN(0xc503cd33, 0xbccd9617), TOBN(0x365dede4, 0xba7730a3), + TOBN(0x798c6355, 
0x5ddb0786), TOBN(0xa6c3200e, 0xfc9cd3bc)}, + {TOBN(0x060ffb2c, 0xe5e35efd), TOBN(0x99a4e25b, 0x5555a1c1), + TOBN(0x11d95375, 0xf70b3751), TOBN(0x0a57354a, 0x160e1bf6)}}, + {{TOBN(0xecb3ae4b, 0xf8e4b065), TOBN(0x07a834c4, 0x2e53022b), + TOBN(0x1cd300b3, 0x8692ed96), TOBN(0x16a6f792, 0x61ee14ec)}, + {TOBN(0x8f1063c6, 0x6a8649ed), TOBN(0xfbcdfcfe, 0x869f3e14), + TOBN(0x2cfb97c1, 0x00a7b3ec), TOBN(0xcea49b3c, 0x7130c2f1)}}, + {{TOBN(0x462d044f, 0xe9d96488), TOBN(0x4b53d52e, 0x8182a0c1), + TOBN(0x84b6ddd3, 0x0391e9e9), TOBN(0x80ab7b48, 0xb1741a09)}, + {TOBN(0xec0e15d4, 0x27d3317f), TOBN(0x8dfc1ddb, 0x1a64671e), + TOBN(0x93cc5d5f, 0xd49c5b92), TOBN(0xc995d53d, 0x3674a331)}}, + {{TOBN(0x302e41ec, 0x090090ae), TOBN(0x2278a0cc, 0xedb06830), + TOBN(0x1d025932, 0xfbc99690), TOBN(0x0c32fbd2, 0xb80d68da)}, + {TOBN(0xd79146da, 0xf341a6c1), TOBN(0xae0ba139, 0x1bef68a0), + TOBN(0xc6b8a563, 0x8d774b3a), TOBN(0x1cf307bd, 0x880ba4d7)}}, + {{TOBN(0xc033bdc7, 0x19803511), TOBN(0xa9f97b3b, 0x8888c3be), + TOBN(0x3d68aebc, 0x85c6d05e), TOBN(0xc3b88a9d, 0x193919eb)}, + {TOBN(0x2d300748, 0xc48b0ee3), TOBN(0x7506bc7c, 0x07a746c1), + TOBN(0xfc48437c, 0x6e6d57f3), TOBN(0x5bd71587, 0xcfeaa91a)}}, + {{TOBN(0xa4ed0408, 0xc1bc5225), TOBN(0xd0b946db, 0x2719226d), + TOBN(0x109ecd62, 0x758d2d43), TOBN(0x75c8485a, 0x2751759b)}, + {TOBN(0xb0b75f49, 0x9ce4177a), TOBN(0x4fa61a1e, 0x79c10c3d), + TOBN(0xc062d300, 0xa167fcd7), TOBN(0x4df3874c, 0x750f0fa8)}}, + {{TOBN(0x29ae2cf9, 0x83dfedc9), TOBN(0xf8437134, 0x8d87631a), + TOBN(0xaf571711, 0x7429c8d2), TOBN(0x18d15867, 0x146d9272)}, + {TOBN(0x83053ecf, 0x69769bb7), TOBN(0xc55eb856, 0xc479ab82), + TOBN(0x5ef7791c, 0x21b0f4b2), TOBN(0xaa5956ba, 0x3d491525)}}, + {{TOBN(0x407a96c2, 0x9fe20eba), TOBN(0xf27168bb, 0xe52a5ad3), + TOBN(0x43b60ab3, 0xbf1d9d89), TOBN(0xe45c51ef, 0x710e727a)}, + {TOBN(0xdfca5276, 0x099b4221), TOBN(0x8dc6407c, 0x2557a159), + TOBN(0x0ead8335, 0x91035895), TOBN(0x0a9db957, 0x9c55dc32)}}, + {{TOBN(0xe40736d3, 0xdf61bc76), TOBN(0x13a619c0, 0x3f778cdb), + TOBN(0x6dd921a4, 0xc56ea28f), TOBN(0x76a52433, 0x2fa647b4)}, + {TOBN(0x23591891, 0xac5bdc5d), TOBN(0xff4a1a72, 0xbac7dc01), + TOBN(0x9905e261, 0x62df8453), TOBN(0x3ac045df, 0xe63b265f)}}, + {{TOBN(0x8a3f341b, 0xad53dba7), TOBN(0x8ec269cc, 0x837b625a), + TOBN(0xd71a2782, 0x3ae31189), TOBN(0x8fb4f9a3, 0x55e96120)}, + {TOBN(0x804af823, 0xff9875cf), TOBN(0x23224f57, 0x5d442a9b), + TOBN(0x1c4d3b9e, 0xecc62679), TOBN(0x91da22fb, 0xa0e7ddb1)}}, + {{TOBN(0xa370324d, 0x6c04a661), TOBN(0x9710d3b6, 0x5e376d17), + TOBN(0xed8c98f0, 0x3044e357), TOBN(0xc364ebbe, 0x6422701c)}, + {TOBN(0x347f5d51, 0x7733d61c), TOBN(0xd55644b9, 0xcea826c3), + TOBN(0x80c6e0ad, 0x55a25548), TOBN(0x0aa7641d, 0x844220a7)}}, + {{TOBN(0x1438ec81, 0x31810660), TOBN(0x9dfa6507, 0xde4b4043), + TOBN(0x10b515d8, 0xcc3e0273), TOBN(0x1b6066dd, 0x28d8cfb2)}, + {TOBN(0xd3b04591, 0x9c9efebd), TOBN(0x425d4bdf, 0xa21c1ff4), + TOBN(0x5fe5af19, 0xd57607d3), TOBN(0xbbf773f7, 0x54481084)}}, + {{TOBN(0x8435bd69, 0x94b03ed1), TOBN(0xd9ad1de3, 0x634cc546), + TOBN(0x2cf423fc, 0x00e420ca), TOBN(0xeed26d80, 0xa03096dd)}, + {TOBN(0xd7f60be7, 0xa4db09d2), TOBN(0xf47f569d, 0x960622f7), + TOBN(0xe5925fd7, 0x7296c729), TOBN(0xeff2db26, 0x26ca2715)}}, + {{TOBN(0xa6fcd014, 0xb913e759), TOBN(0x53da4786, 0x8ff4de93), + TOBN(0x14616d79, 0xc32068e1), TOBN(0xb187d664, 0xccdf352e)}, + {TOBN(0xf7afb650, 0x1dc90b59), TOBN(0x8170e943, 0x7daa1b26), + TOBN(0xc8e3bdd8, 0x700c0a84), TOBN(0x6e8d345f, 0x6482bdfa)}}, + {{TOBN(0x84cfbfa1, 0xc5c5ea50), TOBN(0xd3baf14c, 0x67960681), + 
TOBN(0x26398403, 0x0dd50942), TOBN(0xe4b7839c, 0x4716a663)}, + {TOBN(0xd5f1f794, 0xe7de6dc0), TOBN(0x5cd0f4d4, 0x622aa7ce), + TOBN(0x5295f3f1, 0x59acfeec), TOBN(0x8d933552, 0x953e0607)}}, + {{TOBN(0xc7db8ec5, 0x776c5722), TOBN(0xdc467e62, 0x2b5f290c), + TOBN(0xd4297e70, 0x4ff425a9), TOBN(0x4be924c1, 0x0cf7bb72)}, + {TOBN(0x0d5dc5ae, 0xa1892131), TOBN(0x8bf8a8e3, 0xa705c992), + TOBN(0x73a0b064, 0x7a305ac5), TOBN(0x00c9ca4e, 0x9a8c77a8)}}, + {{TOBN(0x5dfee80f, 0x83774bdd), TOBN(0x63131602, 0x85734485), + TOBN(0xa1b524ae, 0x914a69a9), TOBN(0xebc2ffaf, 0xd4e300d7)}, + {TOBN(0x52c93db7, 0x7cfa46a5), TOBN(0x71e6161f, 0x21653b50), + TOBN(0x3574fc57, 0xa4bc580a), TOBN(0xc09015dd, 0xe1bc1253)}}, + {{TOBN(0x4b7b47b2, 0xd174d7aa), TOBN(0x4072d8e8, 0xf3a15d04), + TOBN(0xeeb7d47f, 0xd6fa07ed), TOBN(0x6f2b9ff9, 0xedbdafb1)}, + {TOBN(0x18c51615, 0x3760fe8a), TOBN(0x7a96e6bf, 0xf06c6c13), + TOBN(0x4d7a0410, 0x0ea2d071), TOBN(0xa1914e9b, 0x0be2a5ce)}}, + {{TOBN(0x5726e357, 0xd8a3c5cf), TOBN(0x1197ecc3, 0x2abb2b13), + TOBN(0x6c0d7f7f, 0x31ae88dd), TOBN(0x15b20d1a, 0xfdbb3efe)}, + {TOBN(0xcd06aa26, 0x70584039), TOBN(0x2277c969, 0xa7dc9747), + TOBN(0xbca69587, 0x7855d815), TOBN(0x899ea238, 0x5188b32a)}}, + {{TOBN(0x37d9228b, 0x760c1c9d), TOBN(0xc7efbb11, 0x9b5c18da), + TOBN(0x7f0d1bc8, 0x19f6dbc5), TOBN(0x4875384b, 0x07e6905b)}, + {TOBN(0xc7c50baa, 0x3ba8cd86), TOBN(0xb0ce40fb, 0xc2905de0), + TOBN(0x70840673, 0x7a231952), TOBN(0xa912a262, 0xcf43de26)}}, + {{TOBN(0x9c38ddcc, 0xeb5b76c1), TOBN(0x746f5285, 0x26fc0ab4), + TOBN(0x52a63a50, 0xd62c269f), TOBN(0x60049c55, 0x99458621)}, + {TOBN(0xe7f48f82, 0x3c2f7c9e), TOBN(0x6bd99043, 0x917d5cf3), + TOBN(0xeb1317a8, 0x8701f469), TOBN(0xbd3fe2ed, 0x9a449fe0)}}, + {{TOBN(0x421e79ca, 0x12ef3d36), TOBN(0x9ee3c36c, 0x3e7ea5de), + TOBN(0xe48198b5, 0xcdff36f7), TOBN(0xaff4f967, 0xc6b82228)}, + {TOBN(0x15e19dd0, 0xc47adb7e), TOBN(0x45699b23, 0x032e7dfa), + TOBN(0x40680c8b, 0x1fae026a), TOBN(0x5a347a48, 0x550dbf4d)}}, + {{TOBN(0xe652533b, 0x3cef0d7d), TOBN(0xd94f7b18, 0x2bbb4381), + TOBN(0x838752be, 0x0e80f500), TOBN(0x8e6e2488, 0x9e9c9bfb)}, + {TOBN(0xc9751697, 0x16caca6a), TOBN(0x866c49d8, 0x38531ad9), + TOBN(0xc917e239, 0x7151ade1), TOBN(0x2d016ec1, 0x6037c407)}}, + {{TOBN(0xa407ccc9, 0x00eac3f9), TOBN(0x835f6280, 0xe2ed4748), + TOBN(0xcc54c347, 0x1cc98e0d), TOBN(0x0e969937, 0xdcb572eb)}, + {TOBN(0x1b16c8e8, 0x8f30c9cb), TOBN(0xa606ae75, 0x373c4661), + TOBN(0x47aa689b, 0x35502cab), TOBN(0xf89014ae, 0x4d9bb64f)}}, + {{TOBN(0x202f6a9c, 0x31c71f7b), TOBN(0x01f95aa3, 0x296ffe5c), + TOBN(0x5fc06014, 0x53cec3a3), TOBN(0xeb991237, 0x5f498a45)}, + {TOBN(0xae9a935e, 0x5d91ba87), TOBN(0xc6ac6281, 0x0b564a19), + TOBN(0x8a8fe81c, 0x3bd44e69), TOBN(0x7c8b467f, 0x9dd11d45)}}, + {{TOBN(0xf772251f, 0xea5b8e69), TOBN(0xaeecb3bd, 0xc5b75fbc), + TOBN(0x1aca3331, 0x887ff0e5), TOBN(0xbe5d49ff, 0x19f0a131)}, + {TOBN(0x582c13aa, 0xe5c8646f), TOBN(0xdbaa12e8, 0x20e19980), + TOBN(0x8f40f31a, 0xf7abbd94), TOBN(0x1f13f5a8, 0x1dfc7663)}}, + {{TOBN(0x5d81f1ee, 0xaceb4fc0), TOBN(0x36256002, 0x5e6f0f42), + TOBN(0x4b67d6d7, 0x751370c8), TOBN(0x2608b698, 0x03e80589)}, + {TOBN(0xcfc0d2fc, 0x05268301), TOBN(0xa6943d39, 0x40309212), + TOBN(0x192a90c2, 0x1fd0e1c2), TOBN(0xb209f113, 0x37f1dc76)}}, + {{TOBN(0xefcc5e06, 0x97bf1298), TOBN(0xcbdb6730, 0x219d639e), + TOBN(0xd009c116, 0xb81e8c6f), TOBN(0xa3ffdde3, 0x1a7ce2e5)}, + {TOBN(0xc53fbaaa, 0xa914d3ba), TOBN(0x836d500f, 0x88df85ee), + TOBN(0xd98dc71b, 0x66ee0751), TOBN(0x5a3d7005, 0x714516fd)}}, + {{TOBN(0x21d3634d, 0x39eedbba), TOBN(0x35cd2e68, 0x0455a46d), 
+ TOBN(0xc8cafe65, 0xf9d7eb0c), TOBN(0xbda3ce9e, 0x00cefb3e)}, + {TOBN(0xddc17a60, 0x2c9cf7a4), TOBN(0x01572ee4, 0x7bcb8773), + TOBN(0xa92b2b01, 0x8c7548df), TOBN(0x732fd309, 0xa84600e3)}}, + {{TOBN(0xe22109c7, 0x16543a40), TOBN(0x9acafd36, 0xfede3c6c), + TOBN(0xfb206852, 0x6824e614), TOBN(0x2a4544a9, 0xda25dca0)}, + {TOBN(0x25985262, 0x91d60b06), TOBN(0x281b7be9, 0x28753545), + TOBN(0xec667b1a, 0x90f13b27), TOBN(0x33a83aff, 0x940e2eb4)}}, + {{TOBN(0x80009862, 0xd5d721d5), TOBN(0x0c3357a3, 0x5bd3a182), + TOBN(0x27f3a83b, 0x7aa2cda4), TOBN(0xb58ae74e, 0xf6f83085)}, + {TOBN(0x2a911a81, 0x2e6dad6b), TOBN(0xde286051, 0xf43d6c5b), + TOBN(0x4bdccc41, 0xf996c4d8), TOBN(0xe7312ec0, 0x0ae1e24e)}}}, + {{{TOBN(0xf8d112e7, 0x6e6485b3), TOBN(0x4d3e24db, 0x771c52f8), + TOBN(0x48e3ee41, 0x684a2f6d), TOBN(0x7161957d, 0x21d95551)}, + {TOBN(0x19631283, 0xcdb12a6c), TOBN(0xbf3fa882, 0x2e50e164), + TOBN(0xf6254b63, 0x3166cc73), TOBN(0x3aefa7ae, 0xaee8cc38)}}, + {{TOBN(0x79b0fe62, 0x3b36f9fd), TOBN(0x26543b23, 0xfde19fc0), + TOBN(0x136e64a0, 0x958482ef), TOBN(0x23f63771, 0x9b095825)}, + {TOBN(0x14cfd596, 0xb6a1142e), TOBN(0x5ea6aac6, 0x335aac0b), + TOBN(0x86a0e8bd, 0xf3081dd5), TOBN(0x5fb89d79, 0x003dc12a)}}, + {{TOBN(0xf615c33a, 0xf72e34d4), TOBN(0x0bd9ea40, 0x110eec35), + TOBN(0x1c12bc5b, 0xc1dea34e), TOBN(0x686584c9, 0x49ae4699)}, + {TOBN(0x13ad95d3, 0x8c97b942), TOBN(0x4609561a, 0x4e5c7562), + TOBN(0x9e94a4ae, 0xf2737f89), TOBN(0xf57594c6, 0x371c78b6)}}, + {{TOBN(0x0f0165fc, 0xe3779ee3), TOBN(0xe00e7f9d, 0xbd495d9e), + TOBN(0x1fa4efa2, 0x20284e7a), TOBN(0x4564bade, 0x47ac6219)}, + {TOBN(0x90e6312a, 0xc4708e8e), TOBN(0x4f5725fb, 0xa71e9adf), + TOBN(0xe95f55ae, 0x3d684b9f), TOBN(0x47f7ccb1, 0x1e94b415)}}, + {{TOBN(0x7322851b, 0x8d946581), TOBN(0xf0d13133, 0xbdf4a012), + TOBN(0xa3510f69, 0x6584dae0), TOBN(0x03a7c171, 0x3c9f6c6d)}, + {TOBN(0x5be97f38, 0xe475381a), TOBN(0xca1ba422, 0x85823334), + TOBN(0xf83cc5c7, 0x0be17dda), TOBN(0x158b1494, 0x0b918c0f)}}, + {{TOBN(0xda3a77e5, 0x522e6b69), TOBN(0x69c908c3, 0xbbcd6c18), + TOBN(0x1f1b9e48, 0xd924fd56), TOBN(0x37c64e36, 0xaa4bb3f7)}, + {TOBN(0x5a4fdbdf, 0xee478d7d), TOBN(0xba75c8bc, 0x0193f7a0), + TOBN(0x84bc1e84, 0x56cd16df), TOBN(0x1fb08f08, 0x46fad151)}}, + {{TOBN(0x8a7cabf9, 0x842e9f30), TOBN(0xa331d4bf, 0x5eab83af), + TOBN(0xd272cfba, 0x017f2a6a), TOBN(0x27560abc, 0x83aba0e3)}, + {TOBN(0x94b83387, 0x0e3a6b75), TOBN(0x25c6aea2, 0x6b9f50f5), + TOBN(0x803d691d, 0xb5fdf6d0), TOBN(0x03b77509, 0xe6333514)}}, + {{TOBN(0x36178903, 0x61a341c1), TOBN(0x3604dc60, 0x0cfd6142), + TOBN(0x022295eb, 0x8533316c), TOBN(0x3dbde4ac, 0x44af2922)}, + {TOBN(0x898afc5d, 0x1c7eef69), TOBN(0x58896805, 0xd14f4fa1), + TOBN(0x05002160, 0x203c21ca), TOBN(0x6f0d1f30, 0x40ef730b)}}, + {{TOBN(0x8e8c44d4, 0x196224f8), TOBN(0x75a4ab95, 0x374d079d), + TOBN(0x79085ecc, 0x7d48f123), TOBN(0x56f04d31, 0x1bf65ad8)}, + {TOBN(0xe220bf1c, 0xbda602b2), TOBN(0x73ee1742, 0xf9612c69), + TOBN(0x76008fc8, 0x084fd06b), TOBN(0x4000ef9f, 0xf11380d1)}}, + {{TOBN(0x48201b4b, 0x12cfe297), TOBN(0x3eee129c, 0x292f74e5), + TOBN(0xe1fe114e, 0xc9e874e8), TOBN(0x899b055c, 0x92c5fc41)}, + {TOBN(0x4e477a64, 0x3a39c8cf), TOBN(0x82f09efe, 0x78963cc9), + TOBN(0x6fd3fd8f, 0xd333f863), TOBN(0x85132b2a, 0xdc949c63)}}, + {{TOBN(0x7e06a3ab, 0x516eb17b), TOBN(0x73bec06f, 0xd2c7372b), + TOBN(0xe4f74f55, 0xba896da6), TOBN(0xbb4afef8, 0x8e9eb40f)}, + {TOBN(0x2d75bec8, 0xe61d66b0), TOBN(0x02bda4b4, 0xef29300b), + TOBN(0x8bbaa8de, 0x026baa5a), TOBN(0xff54befd, 0xa07f4440)}}, + {{TOBN(0xbd9b8b1d, 0xbe7a2af3), TOBN(0xec51caa9, 
0x4fb74a72), + TOBN(0xb9937a4b, 0x63879697), TOBN(0x7c9a9d20, 0xec2687d5)}, + {TOBN(0x1773e44f, 0x6ef5f014), TOBN(0x8abcf412, 0xe90c6900), + TOBN(0x387bd022, 0x8142161e), TOBN(0x50393755, 0xfcb6ff2a)}}, + {{TOBN(0x9813fd56, 0xed6def63), TOBN(0x53cf6482, 0x7d53106c), + TOBN(0x991a35bd, 0x431f7ac1), TOBN(0xf1e274dd, 0x63e65faf)}, + {TOBN(0xf63ffa3c, 0x44cc7880), TOBN(0x411a426b, 0x7c256981), + TOBN(0xb698b9fd, 0x93a420e0), TOBN(0x89fdddc0, 0xae53f8fe)}}, + {{TOBN(0x766e0722, 0x32398baa), TOBN(0x205fee42, 0x5cfca031), + TOBN(0xa49f5341, 0x7a029cf2), TOBN(0xa88c68b8, 0x4023890d)}, + {TOBN(0xbc275041, 0x7337aaa8), TOBN(0x9ed364ad, 0x0eb384f4), + TOBN(0xe0816f85, 0x29aba92f), TOBN(0x2e9e1941, 0x04e38a88)}}, + {{TOBN(0x57eef44a, 0x3dafd2d5), TOBN(0x35d1fae5, 0x97ed98d8), + TOBN(0x50628c09, 0x2307f9b1), TOBN(0x09d84aae, 0xd6cba5c6)}, + {TOBN(0x67071bc7, 0x88aaa691), TOBN(0x2dea57a9, 0xafe6cb03), + TOBN(0xdfe11bb4, 0x3d78ac01), TOBN(0x7286418c, 0x7fd7aa51)}}, + {{TOBN(0xfabf7709, 0x77f7195a), TOBN(0x8ec86167, 0xadeb838f), + TOBN(0xea1285a8, 0xbb4f012d), TOBN(0xd6883503, 0x9a3eab3f)}, + {TOBN(0xee5d24f8, 0x309004c2), TOBN(0xa96e4b76, 0x13ffe95e), + TOBN(0x0cdffe12, 0xbd223ea4), TOBN(0x8f5c2ee5, 0xb6739a53)}}, + {{TOBN(0x5cb4aaa5, 0xdd968198), TOBN(0xfa131c52, 0x72413a6c), + TOBN(0x53d46a90, 0x9536d903), TOBN(0xb270f0d3, 0x48606d8e)}, + {TOBN(0x518c7564, 0xa053a3bc), TOBN(0x088254b7, 0x1a86caef), + TOBN(0xb3ba8cb4, 0x0ab5efd0), TOBN(0x5c59900e, 0x4605945d)}}, + {{TOBN(0xecace1dd, 0xa1887395), TOBN(0x40960f36, 0x932a65de), + TOBN(0x9611ff5c, 0x3aa95529), TOBN(0xc58215b0, 0x7c1e5a36)}, + {TOBN(0xd48c9b58, 0xf0e1a524), TOBN(0xb406856b, 0xf590dfb8), + TOBN(0xc7605e04, 0x9cd95662), TOBN(0x0dd036ee, 0xa33ecf82)}}, + {{TOBN(0xa50171ac, 0xc33156b3), TOBN(0xf09d24ea, 0x4a80172e), + TOBN(0x4e1f72c6, 0x76dc8eef), TOBN(0xe60caadc, 0x5e3d44ee)}, + {TOBN(0x006ef8a6, 0x979b1d8f), TOBN(0x60908a1c, 0x97788d26), + TOBN(0x6e08f95b, 0x266feec0), TOBN(0x618427c2, 0x22e8c94e)}}, + {{TOBN(0x3d613339, 0x59145a65), TOBN(0xcd9bc368, 0xfa406337), + TOBN(0x82d11be3, 0x2d8a52a0), TOBN(0xf6877b27, 0x97a1c590)}, + {TOBN(0x837a819b, 0xf5cbdb25), TOBN(0x2a4fd1d8, 0xde090249), + TOBN(0x622a7de7, 0x74990e5f), TOBN(0x840fa5a0, 0x7945511b)}}, + {{TOBN(0x30b974be, 0x6558842d), TOBN(0x70df8c64, 0x17f3d0a6), + TOBN(0x7c803520, 0x7542e46d), TOBN(0x7251fe7f, 0xe4ecc823)}, + {TOBN(0xe59134cb, 0x5e9aac9a), TOBN(0x11bb0934, 0xf0045d71), + TOBN(0x53e5d9b5, 0xdbcb1d4e), TOBN(0x8d97a905, 0x92defc91)}}, + {{TOBN(0xfe289327, 0x7946d3f9), TOBN(0xe132bd24, 0x07472273), + TOBN(0xeeeb510c, 0x1eb6ae86), TOBN(0x777708c5, 0xf0595067)}, + {TOBN(0x18e2c8cd, 0x1297029e), TOBN(0x2c61095c, 0xbbf9305e), + TOBN(0xe466c258, 0x6b85d6d9), TOBN(0x8ac06c36, 0xda1ea530)}}, + {{TOBN(0xa365dc39, 0xa1304668), TOBN(0xe4a9c885, 0x07f89606), + TOBN(0x65a4898f, 0xacc7228d), TOBN(0x3e2347ff, 0x84ca8303)}, + {TOBN(0xa5f6fb77, 0xea7d23a3), TOBN(0x2fac257d, 0x672a71cd), + TOBN(0x6908bef8, 0x7e6a44d3), TOBN(0x8ff87566, 0x891d3d7a)}}, + {{TOBN(0xe58e90b3, 0x6b0cf82e), TOBN(0x6438d246, 0x2615b5e7), + TOBN(0x07b1f8fc, 0x669c145a), TOBN(0xb0d8b2da, 0x36f1e1cb)}, + {TOBN(0x54d5dadb, 0xd9184c4d), TOBN(0x3dbb18d5, 0xf93d9976), + TOBN(0x0a3e0f56, 0xd1147d47), TOBN(0x2afa8c8d, 0xa0a48609)}}, + {{TOBN(0x275353e8, 0xbc36742c), TOBN(0x898f427e, 0xeea0ed90), + TOBN(0x26f4947e, 0x3e477b00), TOBN(0x8ad8848a, 0x308741e3)}, + {TOBN(0x6c703c38, 0xd74a2a46), TOBN(0x5e3e05a9, 0x9ba17ba2), + TOBN(0xc1fa6f66, 0x4ab9a9e4), TOBN(0x474a2d9a, 0x3841d6ec)}}, + {{TOBN(0x871239ad, 0x653ae326), 
TOBN(0x14bcf72a, 0xa74cbb43), + TOBN(0x8737650e, 0x20d4c083), TOBN(0x3df86536, 0x110ed4af)}, + {TOBN(0xd2d86fe7, 0xb53ca555), TOBN(0x688cb00d, 0xabd5d538), + TOBN(0xcf81bda3, 0x1ad38468), TOBN(0x7ccfe3cc, 0xf01167b6)}}, + {{TOBN(0xcf4f47e0, 0x6c4c1fe6), TOBN(0x557e1f1a, 0x298bbb79), + TOBN(0xf93b974f, 0x30d45a14), TOBN(0x174a1d2d, 0x0baf97c4)}, + {TOBN(0x7a003b30, 0xc51fbf53), TOBN(0xd8940991, 0xee68b225), + TOBN(0x5b0aa7b7, 0x1c0f4173), TOBN(0x975797c9, 0xa20a7153)}}, + {{TOBN(0x26e08c07, 0xe3533d77), TOBN(0xd7222e6a, 0x2e341c99), + TOBN(0x9d60ec3d, 0x8d2dc4ed), TOBN(0xbdfe0d8f, 0x7c476cf8)}, + {TOBN(0x1fe59ab6, 0x1d056605), TOBN(0xa9ea9df6, 0x86a8551f), + TOBN(0x8489941e, 0x47fb8d8c), TOBN(0xfeb874eb, 0x4a7f1b10)}}, + {{TOBN(0xfe5fea86, 0x7ee0d98f), TOBN(0x201ad34b, 0xdbf61864), + TOBN(0x45d8fe47, 0x37c031d4), TOBN(0xd5f49fae, 0x795f0822)}, + {TOBN(0xdb0fb291, 0xc7f4a40c), TOBN(0x2e69d9c1, 0x730ddd92), + TOBN(0x754e1054, 0x49d76987), TOBN(0x8a24911d, 0x7662db87)}}, + {{TOBN(0x61fc1810, 0x60a71676), TOBN(0xe852d1a8, 0xf66a8ad1), + TOBN(0x172bbd65, 0x6417231e), TOBN(0x0d6de7bd, 0x3babb11f)}, + {TOBN(0x6fde6f88, 0xc8e347f8), TOBN(0x1c587547, 0x9bd99cc3), + TOBN(0x78e54ed0, 0x34076950), TOBN(0x97f0f334, 0x796e83ba)}}, + {{TOBN(0xe4dbe1ce, 0x4924867a), TOBN(0xbd5f51b0, 0x60b84917), + TOBN(0x37530040, 0x3cb09a79), TOBN(0xdb3fe0f8, 0xff1743d8)}, + {TOBN(0xed7894d8, 0x556fa9db), TOBN(0xfa262169, 0x23412fbf), + TOBN(0x563be0db, 0xba7b9291), TOBN(0x6ca8b8c0, 0x0c9fb234)}}, + {{TOBN(0xed406aa9, 0xbd763802), TOBN(0xc21486a0, 0x65303da1), + TOBN(0x61ae291e, 0xc7e62ec4), TOBN(0x622a0492, 0xdf99333e)}, + {TOBN(0x7fd80c9d, 0xbb7a8ee0), TOBN(0xdc2ed3bc, 0x6c01aedb), + TOBN(0x35c35a12, 0x08be74ec), TOBN(0xd540cb1a, 0x469f671f)}}, + {{TOBN(0xd16ced4e, 0xcf84f6c7), TOBN(0x8561fb9c, 0x2d090f43), + TOBN(0x7e693d79, 0x6f239db4), TOBN(0xa736f928, 0x77bd0d94)}, + {TOBN(0x07b4d929, 0x2c1950ee), TOBN(0xda177543, 0x56dc11b3), + TOBN(0xa5dfbbaa, 0x7a6a878e), TOBN(0x1c70cb29, 0x4decb08a)}}, + {{TOBN(0xfba28c8b, 0x6f0f7c50), TOBN(0xa8eba2b8, 0x854dcc6d), + TOBN(0x5ff8e89a, 0x36b78642), TOBN(0x070c1c8e, 0xf6873adf)}, + {TOBN(0xbbd3c371, 0x6484d2e4), TOBN(0xfb78318f, 0x0d414129), + TOBN(0x2621a39c, 0x6ad93b0b), TOBN(0x979d74c2, 0xa9e917f7)}}, + {{TOBN(0xfc195647, 0x61fb0428), TOBN(0x4d78954a, 0xbee624d4), + TOBN(0xb94896e0, 0xb8ae86fd), TOBN(0x6667ac0c, 0xc91c8b13)}, + {TOBN(0x9f180512, 0x43bcf832), TOBN(0xfbadf8b7, 0xa0010137), + TOBN(0xc69b4089, 0xb3ba8aa7), TOBN(0xfac4bacd, 0xe687ce85)}}, + {{TOBN(0x9164088d, 0x977eab40), TOBN(0x51f4c5b6, 0x2760b390), + TOBN(0xd238238f, 0x340dd553), TOBN(0x358566c3, 0xdb1d31c9)}, + {TOBN(0x3a5ad69e, 0x5068f5ff), TOBN(0xf31435fc, 0xdaff6b06), + TOBN(0xae549a5b, 0xd6debff0), TOBN(0x59e5f0b7, 0x75e01331)}}, + {{TOBN(0x5d492fb8, 0x98559acf), TOBN(0x96018c2e, 0x4db79b50), + TOBN(0x55f4a48f, 0x609f66aa), TOBN(0x1943b3af, 0x4900a14f)}, + {TOBN(0xc22496df, 0x15a40d39), TOBN(0xb2a44684, 0x4c20f7c5), + TOBN(0x76a35afa, 0x3b98404c), TOBN(0xbec75725, 0xff5d1b77)}}, + {{TOBN(0xb67aa163, 0xbea06444), TOBN(0x27e95bb2, 0xf724b6f2), + TOBN(0x3c20e3e9, 0xd238c8ab), TOBN(0x1213754e, 0xddd6ae17)}, + {TOBN(0x8c431020, 0x716e0f74), TOBN(0x6679c82e, 0xffc095c2), + TOBN(0x2eb3adf4, 0xd0ac2932), TOBN(0x2cc970d3, 0x01bb7a76)}}, + {{TOBN(0x70c71f2f, 0x740f0e66), TOBN(0x545c616b, 0x2b6b23cc), + TOBN(0x4528cfcb, 0xb40a8bd7), TOBN(0xff839633, 0x2ab27722)}, + {TOBN(0x049127d9, 0x025ac99a), TOBN(0xd314d4a0, 0x2b63e33b), + TOBN(0xc8c310e7, 0x28d84519), TOBN(0x0fcb8983, 0xb3bc84ba)}}, + {{TOBN(0x2cc52261, 
0x38634818), TOBN(0x501814f4, 0xb44c2e0b), + TOBN(0xf7e181aa, 0x54dfdba3), TOBN(0xcfd58ff0, 0xe759718c)}, + {TOBN(0xf90cdb14, 0xd3b507a8), TOBN(0x57bd478e, 0xc50bdad8), + TOBN(0x29c197e2, 0x50e5f9aa), TOBN(0x4db6eef8, 0xe40bc855)}}, + {{TOBN(0x2cc8f21a, 0xd1fc0654), TOBN(0xc71cc963, 0x81269d73), + TOBN(0xecfbb204, 0x077f49f9), TOBN(0xdde92571, 0xca56b793)}, + {TOBN(0x9abed6a3, 0xf97ad8f7), TOBN(0xe6c19d3f, 0x924de3bd), + TOBN(0x8dce92f4, 0xa140a800), TOBN(0x85f44d1e, 0x1337af07)}}, + {{TOBN(0x5953c08b, 0x09d64c52), TOBN(0xa1b5e49f, 0xf5df9749), + TOBN(0x336a8fb8, 0x52735f7d), TOBN(0xb332b6db, 0x9add676b)}, + {TOBN(0x558b88a0, 0xb4511aa4), TOBN(0x09788752, 0xdbd5cc55), + TOBN(0x16b43b9c, 0xd8cd52bd), TOBN(0x7f0bc5a0, 0xc2a2696b)}}, + {{TOBN(0x146e12d4, 0xc11f61ef), TOBN(0x9ce10754, 0x3a83e79e), + TOBN(0x08ec73d9, 0x6cbfca15), TOBN(0x09ff29ad, 0x5b49653f)}, + {TOBN(0xe31b72bd, 0xe7da946e), TOBN(0xebf9eb3b, 0xee80a4f2), + TOBN(0xd1aabd08, 0x17598ce4), TOBN(0x18b5fef4, 0x53f37e80)}}, + {{TOBN(0xd5d5cdd3, 0x5958cd79), TOBN(0x3580a1b5, 0x1d373114), + TOBN(0xa36e4c91, 0xfa935726), TOBN(0xa38c534d, 0xef20d760)}, + {TOBN(0x7088e40a, 0x2ff5845b), TOBN(0xe5bb40bd, 0xbd78177f), + TOBN(0x4f06a7a8, 0x857f9920), TOBN(0xe3cc3e50, 0xe968f05d)}}, + {{TOBN(0x1d68b7fe, 0xe5682d26), TOBN(0x5206f76f, 0xaec7f87c), + TOBN(0x41110530, 0x041951ab), TOBN(0x58ec52c1, 0xd4b5a71a)}, + {TOBN(0xf3488f99, 0x0f75cf9a), TOBN(0xf411951f, 0xba82d0d5), + TOBN(0x27ee75be, 0x618895ab), TOBN(0xeae060d4, 0x6d8aab14)}}, + {{TOBN(0x9ae1df73, 0x7fb54dc2), TOBN(0x1f3e391b, 0x25963649), + TOBN(0x242ec32a, 0xfe055081), TOBN(0x5bd450ef, 0x8491c9bd)}, + {TOBN(0x367efc67, 0x981eb389), TOBN(0xed7e1928, 0x3a0550d5), + TOBN(0x362e776b, 0xab3ce75c), TOBN(0xe890e308, 0x1f24c523)}}, + {{TOBN(0xb961b682, 0xfeccef76), TOBN(0x8b8e11f5, 0x8bba6d92), + TOBN(0x8f2ccc4c, 0x2b2375c4), TOBN(0x0d7f7a52, 0xe2f86cfa)}, + {TOBN(0xfd94d30a, 0x9efe5633), TOBN(0x2d8d246b, 0x5451f934), + TOBN(0x2234c6e3, 0x244e6a00), TOBN(0xde2b5b0d, 0xddec8c50)}}, + {{TOBN(0x2ce53c5a, 0xbf776f5b), TOBN(0x6f724071, 0x60357b05), + TOBN(0xb2593717, 0x71bf3f7a), TOBN(0x87d2501c, 0x440c4a9f)}, + {TOBN(0x440552e1, 0x87b05340), TOBN(0xb7bf7cc8, 0x21624c32), + TOBN(0x4155a6ce, 0x22facddb), TOBN(0x5a4228cb, 0x889837ef)}}, + {{TOBN(0xef87d6d6, 0xfd4fd671), TOBN(0xa233687e, 0xc2daa10e), + TOBN(0x75622244, 0x03c0eb96), TOBN(0x7632d184, 0x8bf19be6)}, + {TOBN(0x05d0f8e9, 0x40735ff4), TOBN(0x3a3e6e13, 0xc00931f1), + TOBN(0x31ccde6a, 0xdafe3f18), TOBN(0xf381366a, 0xcfe51207)}}, + {{TOBN(0x24c222a9, 0x60167d92), TOBN(0x62f9d6f8, 0x7529f18c), + TOBN(0x412397c0, 0x0353b114), TOBN(0x334d89dc, 0xef808043)}, + {TOBN(0xd9ec63ba, 0x2a4383ce), TOBN(0xcec8e937, 0x5cf92ba0), + TOBN(0xfb8b4288, 0xc8be74c0), TOBN(0x67d6912f, 0x105d4391)}}, + {{TOBN(0x7b996c46, 0x1b913149), TOBN(0x36aae2ef, 0x3a4e02da), + TOBN(0xb68aa003, 0x972de594), TOBN(0x284ec70d, 0x4ec6d545)}, + {TOBN(0xf3d2b2d0, 0x61391d54), TOBN(0x69c5d5d6, 0xfe114e92), + TOBN(0xbe0f00b5, 0xb4482dff), TOBN(0xe1596fa5, 0xf5bf33c5)}}, + {{TOBN(0x10595b56, 0x96a71cba), TOBN(0x944938b2, 0xfdcadeb7), + TOBN(0xa282da4c, 0xfccd8471), TOBN(0x98ec05f3, 0x0d37bfe1)}, + {TOBN(0xe171ce1b, 0x0698304a), TOBN(0x2d691444, 0x21bdf79b), + TOBN(0xd0cd3b74, 0x1b21dec1), TOBN(0x712ecd8b, 0x16a15f71)}}, + {{TOBN(0x8d4c00a7, 0x00fd56e1), TOBN(0x02ec9692, 0xf9527c18), + TOBN(0x21c44937, 0x4a3e42e1), TOBN(0x9176fbab, 0x1392ae0a)}, + {TOBN(0x8726f1ba, 0x44b7b618), TOBN(0xb4d7aae9, 0xf1de491c), + TOBN(0xf91df7b9, 0x07b582c0), TOBN(0x7e116c30, 0xef60aa3a)}}, + 
{{TOBN(0x99270f81, 0x466265d7), TOBN(0xb15b6fe2, 0x4df7adf0), + TOBN(0xfe33b2d3, 0xf9738f7f), TOBN(0x48553ab9, 0xd6d70f95)}, + {TOBN(0x2cc72ac8, 0xc21e94db), TOBN(0x795ac38d, 0xbdc0bbee), + TOBN(0x0a1be449, 0x2e40478f), TOBN(0x81bd3394, 0x052bde55)}}, + {{TOBN(0x63c8dbe9, 0x56b3c4f2), TOBN(0x017a99cf, 0x904177cc), + TOBN(0x947bbddb, 0x4d010fc1), TOBN(0xacf9b00b, 0xbb2c9b21)}, + {TOBN(0x2970bc8d, 0x47173611), TOBN(0x1a4cbe08, 0xac7d756f), + TOBN(0x06d9f4aa, 0x67d541a2), TOBN(0xa3e8b689, 0x59c2cf44)}}, + {{TOBN(0xaad066da, 0x4d88f1dd), TOBN(0xc604f165, 0x7ad35dea), + TOBN(0x7edc0720, 0x4478ca67), TOBN(0xa10dfae0, 0xba02ce06)}, + {TOBN(0xeceb1c76, 0xaf36f4e4), TOBN(0x994b2292, 0xaf3f8f48), + TOBN(0xbf9ed77b, 0x77c8a68c), TOBN(0x74f544ea, 0x51744c9d)}}, + {{TOBN(0x82d05bb9, 0x8113a757), TOBN(0x4ef2d2b4, 0x8a9885e4), + TOBN(0x1e332be5, 0x1aa7865f), TOBN(0x22b76b18, 0x290d1a52)}, + {TOBN(0x308a2310, 0x44351683), TOBN(0x9d861896, 0xa3f22840), + TOBN(0x5959ddcd, 0x841ed947), TOBN(0x0def0c94, 0x154b73bf)}}, + {{TOBN(0xf0105417, 0x4c7c15e0), TOBN(0x539bfb02, 0x3a277c32), + TOBN(0xe699268e, 0xf9dccf5f), TOBN(0x9f5796a5, 0x0247a3bd)}, + {TOBN(0x8b839de8, 0x4f157269), TOBN(0xc825c1e5, 0x7a30196b), + TOBN(0x6ef0aabc, 0xdc8a5a91), TOBN(0xf4a8ce6c, 0x498b7fe6)}}, + {{TOBN(0x1cce35a7, 0x70cbac78), TOBN(0x83488e9b, 0xf6b23958), + TOBN(0x0341a070, 0xd76cb011), TOBN(0xda6c9d06, 0xae1b2658)}, + {TOBN(0xb701fb30, 0xdd648c52), TOBN(0x994ca02c, 0x52fb9fd1), + TOBN(0x06933117, 0x6f563086), TOBN(0x3d2b8100, 0x17856bab)}}, + {{TOBN(0xe89f48c8, 0x5963a46e), TOBN(0x658ab875, 0xa99e61c7), + TOBN(0x6e296f87, 0x4b8517b4), TOBN(0x36c4fcdc, 0xfc1bc656)}, + {TOBN(0xde5227a1, 0xa3906def), TOBN(0x9fe95f57, 0x62418945), + TOBN(0x20c91e81, 0xfdd96cde), TOBN(0x5adbe47e, 0xda4480de)}}, + {{TOBN(0xa009370f, 0x396de2b6), TOBN(0x98583d4b, 0xf0ecc7bd), + TOBN(0xf44f6b57, 0xe51d0672), TOBN(0x03d6b078, 0x556b1984)}, + {TOBN(0x27dbdd93, 0xb0b64912), TOBN(0x9b3a3434, 0x15687b09), + TOBN(0x0dba6461, 0x51ec20a9), TOBN(0xec93db7f, 0xff28187c)}}, + {{TOBN(0x00ff8c24, 0x66e48bdd), TOBN(0x2514f2f9, 0x11ccd78e), + TOBN(0xeba11f4f, 0xe1250603), TOBN(0x8a22cd41, 0x243fa156)}, + {TOBN(0xa4e58df4, 0xb283e4c6), TOBN(0x78c29859, 0x8b39783f), + TOBN(0x5235aee2, 0xa5259809), TOBN(0xc16284b5, 0x0e0227dd)}}, + {{TOBN(0xa5f57916, 0x1338830d), TOBN(0x6d4b8a6b, 0xd2123fca), + TOBN(0x236ea68a, 0xf9c546f8), TOBN(0xc1d36873, 0xfa608d36)}, + {TOBN(0xcd76e495, 0x8d436d13), TOBN(0xd4d9c221, 0x8fb080af), + TOBN(0x665c1728, 0xe8ad3fb5), TOBN(0xcf1ebe4d, 0xb3d572e0)}}, + {{TOBN(0xa7a8746a, 0x584c5e20), TOBN(0x267e4ea1, 0xb9dc7035), + TOBN(0x593a15cf, 0xb9548c9b), TOBN(0x5e6e2135, 0x4bd012f3)}, + {TOBN(0xdf31cc6a, 0x8c8f936e), TOBN(0x8af84d04, 0xb5c241dc), + TOBN(0x63990a6f, 0x345efb86), TOBN(0x6fef4e61, 0xb9b962cb)}}}, + {{{TOBN(0xf6368f09, 0x25722608), TOBN(0x131260db, 0x131cf5c6), + TOBN(0x40eb353b, 0xfab4f7ac), TOBN(0x85c78880, 0x37eee829)}, + {TOBN(0x4c1581ff, 0xc3bdf24e), TOBN(0x5bff75cb, 0xf5c3c5a8), + TOBN(0x35e8c83f, 0xa14e6f40), TOBN(0xb81d1c0f, 0x0295e0ca)}}, + {{TOBN(0xfcde7cc8, 0xf43a730f), TOBN(0xe89b6f3c, 0x33ab590e), + TOBN(0xc823f529, 0xad03240b), TOBN(0x82b79afe, 0x98bea5db)}, + {TOBN(0x568f2856, 0x962fe5de), TOBN(0x0c590adb, 0x60c591f3), + TOBN(0x1fc74a14, 0x4a28a858), TOBN(0x3b662498, 0xb3203f4c)}}, + {{TOBN(0x91e3cf0d, 0x6c39765a), TOBN(0xa2db3acd, 0xac3cca0b), + TOBN(0x288f2f08, 0xcb953b50), TOBN(0x2414582c, 0xcf43cf1a)}, + {TOBN(0x8dec8bbc, 0x60eee9a8), TOBN(0x54c79f02, 0x729aa042), + TOBN(0xd81cd5ec, 0x6532f5d5), TOBN(0xa672303a, 
0xcf82e15f)}}, + {{TOBN(0x376aafa8, 0x719c0563), TOBN(0xcd8ad2dc, 0xbc5fc79f), + TOBN(0x303fdb9f, 0xcb750cd3), TOBN(0x14ff052f, 0x4418b08e)}, + {TOBN(0xf75084cf, 0x3e2d6520), TOBN(0x7ebdf0f8, 0x144ed509), + TOBN(0xf43bf0f2, 0xd3f25b98), TOBN(0x86ad71cf, 0xa354d837)}}, + {{TOBN(0xb827fe92, 0x26f43572), TOBN(0xdfd3ab5b, 0x5d824758), + TOBN(0x315dd23a, 0x539094c1), TOBN(0x85c0e37a, 0x66623d68)}, + {TOBN(0x575c7972, 0x7be19ae0), TOBN(0x616a3396, 0xdf0d36b5), + TOBN(0xa1ebb3c8, 0x26b1ff7e), TOBN(0x635b9485, 0x140ad453)}}, + {{TOBN(0x92bf3cda, 0xda430c0b), TOBN(0x4702850e, 0x3a96dac6), + TOBN(0xc91cf0a5, 0x15ac326a), TOBN(0x95de4f49, 0xab8c25e4)}, + {TOBN(0xb01bad09, 0xe265c17c), TOBN(0x24e45464, 0x087b3881), + TOBN(0xd43e583c, 0xe1fac5ca), TOBN(0xe17cb318, 0x6ead97a6)}}, + {{TOBN(0x6cc39243, 0x74dcec46), TOBN(0x33cfc02d, 0x54c2b73f), + TOBN(0x82917844, 0xf26cd99c), TOBN(0x8819dd95, 0xd1773f89)}, + {TOBN(0x09572aa6, 0x0871f427), TOBN(0x8e0cf365, 0xf6f01c34), + TOBN(0x7fa52988, 0xbff1f5af), TOBN(0x4eb357ea, 0xe75e8e50)}}, + {{TOBN(0xd9d0c8c4, 0x868af75d), TOBN(0xd7325cff, 0x45c8c7ea), + TOBN(0xab471996, 0xcc81ecb0), TOBN(0xff5d55f3, 0x611824ed)}, + {TOBN(0xbe314541, 0x1977a0ee), TOBN(0x5085c4c5, 0x722038c6), + TOBN(0x2d5335bf, 0xf94bb495), TOBN(0x894ad8a6, 0xc8e2a082)}}, + {{TOBN(0x5c3e2341, 0xada35438), TOBN(0xf4a9fc89, 0x049b8c4e), + TOBN(0xbeeb355a, 0x9f17cf34), TOBN(0x3f311e0e, 0x6c91fe10)}, + {TOBN(0xc2d20038, 0x92ab9891), TOBN(0x257bdcc1, 0x3e8ce9a9), + TOBN(0x1b2d9789, 0x88c53bee), TOBN(0x927ce89a, 0xcdba143a)}}, + {{TOBN(0xb0a32cca, 0x523db280), TOBN(0x5c889f8a, 0x50d43783), + TOBN(0x503e04b3, 0x4897d16f), TOBN(0x8cdb6e78, 0x08f5f2e8)}, + {TOBN(0x6ab91cf0, 0x179c8e74), TOBN(0xd8874e52, 0x48211d60), + TOBN(0xf948d4d5, 0xea851200), TOBN(0x4076d41e, 0xe6f9840a)}}, + {{TOBN(0xc20e263c, 0x47b517ea), TOBN(0x79a448fd, 0x30685e5e), + TOBN(0xe55f6f78, 0xf90631a0), TOBN(0x88a790b1, 0xa79e6346)}, + {TOBN(0x62160c7d, 0x80969fe8), TOBN(0x54f92fd4, 0x41491bb9), + TOBN(0xa6645c23, 0x5c957526), TOBN(0xf44cc5ae, 0xbea3ce7b)}}, + {{TOBN(0xf7628327, 0x8b1e68b7), TOBN(0xc731ad7a, 0x303f29d3), + TOBN(0xfe5a9ca9, 0x57d03ecb), TOBN(0x96c0d50c, 0x41bc97a7)}, + {TOBN(0xc4669fe7, 0x9b4f7f24), TOBN(0xfdd781d8, 0x3d9967ef), + TOBN(0x7892c7c3, 0x5d2c208d), TOBN(0x8bf64f7c, 0xae545cb3)}}, + {{TOBN(0xc01f862c, 0x467be912), TOBN(0xf4c85ee9, 0xc73d30cc), + TOBN(0x1fa6f4be, 0x6ab83ec7), TOBN(0xa07a3c1c, 0x4e3e3cf9)}, + {TOBN(0x87f8ef45, 0x0c00beb3), TOBN(0x30e2c2b3, 0x000d4c3e), + TOBN(0x1aa00b94, 0xfe08bf5b), TOBN(0x32c133aa, 0x9224ef52)}}, + {{TOBN(0x38df16bb, 0x32e5685d), TOBN(0x68a9e069, 0x58e6f544), + TOBN(0x495aaff7, 0xcdc5ebc6), TOBN(0xf894a645, 0x378b135f)}, + {TOBN(0xf316350a, 0x09e27ecf), TOBN(0xeced201e, 0x58f7179d), + TOBN(0x2eec273c, 0xe97861ba), TOBN(0x47ec2cae, 0xd693be2e)}}, + {{TOBN(0xfa4c97c4, 0xf68367ce), TOBN(0xe4f47d0b, 0xbe5a5755), + TOBN(0x17de815d, 0xb298a979), TOBN(0xd7eca659, 0xc177dc7d)}, + {TOBN(0x20fdbb71, 0x49ded0a3), TOBN(0x4cb2aad4, 0xfb34d3c5), + TOBN(0x2cf31d28, 0x60858a33), TOBN(0x3b6873ef, 0xa24aa40f)}}, + {{TOBN(0x540234b2, 0x2c11bb37), TOBN(0x2d0366dd, 0xed4c74a3), + TOBN(0xf9a968da, 0xeec5f25d), TOBN(0x36601068, 0x67b63142)}, + {TOBN(0x07cd6d2c, 0x68d7b6d4), TOBN(0xa8f74f09, 0x0c842942), + TOBN(0xe2751404, 0x7768b1ee), TOBN(0x4b5f7e89, 0xfe62aee4)}}, + {{TOBN(0xc6a77177, 0x89070d26), TOBN(0xa1f28e4e, 0xdd1c8bc7), + TOBN(0xea5f4f06, 0x469e1f17), TOBN(0x78fc242a, 0xfbdb78e0)}, + {TOBN(0xc9c7c592, 0x8b0588f1), TOBN(0xb6b7a0fd, 0x1535921e), + TOBN(0xcc5bdb91, 0xbde5ae35), 
TOBN(0xb42c485e, 0x12ff1864)}}, + {{TOBN(0xa1113e13, 0xdbab98aa), TOBN(0xde9d469b, 0xa17b1024), + TOBN(0x23f48b37, 0xc0462d3a), TOBN(0x3752e537, 0x7c5c078d)}, + {TOBN(0xe3a86add, 0x15544eb9), TOBN(0xf013aea7, 0x80fba279), + TOBN(0x8b5bb76c, 0xf22001b5), TOBN(0xe617ba14, 0xf02891ab)}}, + {{TOBN(0xd39182a6, 0x936219d3), TOBN(0x5ce1f194, 0xae51cb19), + TOBN(0xc78f8598, 0xbf07a74c), TOBN(0x6d7158f2, 0x22cbf1bc)}, + {TOBN(0x3b846b21, 0xe300ce18), TOBN(0x35fba630, 0x2d11275d), + TOBN(0x5fe25c36, 0xa0239b9b), TOBN(0xd8beb35d, 0xdf05d940)}}, + {{TOBN(0x4db02bb0, 0x1f7e320d), TOBN(0x0641c364, 0x6da320ea), + TOBN(0x6d95fa5d, 0x821389a3), TOBN(0x92699748, 0x8fcd8e3d)}, + {TOBN(0x316fef17, 0xceb6c143), TOBN(0x67fcb841, 0xd933762b), + TOBN(0xbb837e35, 0x118b17f8), TOBN(0x4b92552f, 0x9fd24821)}}, + {{TOBN(0xae6bc70e, 0x46aca793), TOBN(0x1cf0b0e4, 0xe579311b), + TOBN(0x8dc631be, 0x5802f716), TOBN(0x099bdc6f, 0xbddbee4d)}, + {TOBN(0xcc352bb2, 0x0caf8b05), TOBN(0xf74d505a, 0x72d63df2), + TOBN(0xb9876d4b, 0x91c4f408), TOBN(0x1ce18473, 0x9e229b2d)}}, + {{TOBN(0x49507597, 0x83abdb4a), TOBN(0x850fbcb6, 0xdee84b18), + TOBN(0x6325236e, 0x609e67dc), TOBN(0x04d831d9, 0x9336c6d8)}, + {TOBN(0x8deaae3b, 0xfa12d45d), TOBN(0xe425f8ce, 0x4746e246), + TOBN(0x8004c175, 0x24f5f31e), TOBN(0xaca16d8f, 0xad62c3b7)}}, + {{TOBN(0x0dc15a6a, 0x9152f934), TOBN(0xf1235e5d, 0xed0e12c1), + TOBN(0xc33c06ec, 0xda477dac), TOBN(0x76be8732, 0xb2ea0006)}, + {TOBN(0xcf3f7831, 0x0c0cd313), TOBN(0x3c524553, 0xa614260d), + TOBN(0x31a756f8, 0xcab22d15), TOBN(0x03ee10d1, 0x77827a20)}}, + {{TOBN(0xd1e059b2, 0x1994ef20), TOBN(0x2a653b69, 0x638ae318), + TOBN(0x70d5eb58, 0x2f699010), TOBN(0x279739f7, 0x09f5f84a)}, + {TOBN(0x5da4663c, 0x8b799336), TOBN(0xfdfdf14d, 0x203c37eb), + TOBN(0x32d8a9dc, 0xa1dbfb2d), TOBN(0xab40cff0, 0x77d48f9b)}}, + {{TOBN(0xc018b383, 0xd20b42d5), TOBN(0xf9a810ef, 0x9f78845f), + TOBN(0x40af3753, 0xbdba9df0), TOBN(0xb90bdcfc, 0x131dfdf9)}, + {TOBN(0x18720591, 0xf01ab782), TOBN(0xc823f211, 0x6af12a88), + TOBN(0xa51b80f3, 0x0dc14401), TOBN(0xde248f77, 0xfb2dfbe3)}}, + {{TOBN(0xef5a44e5, 0x0cafe751), TOBN(0x73997c9c, 0xd4dcd221), + TOBN(0x32fd86d1, 0xde854024), TOBN(0xd5b53adc, 0xa09b84bb)}, + {TOBN(0x008d7a11, 0xdcedd8d1), TOBN(0x406bd1c8, 0x74b32c84), + TOBN(0x5d4472ff, 0x05dde8b1), TOBN(0x2e25f2cd, 0xfce2b32f)}}, + {{TOBN(0xbec0dd5e, 0x29dfc254), TOBN(0x4455fcf6, 0x2b98b267), + TOBN(0x0b4d43a5, 0xc72df2ad), TOBN(0xea70e6be, 0x48a75397)}, + {TOBN(0x2aad6169, 0x5820f3bf), TOBN(0xf410d2dd, 0x9e37f68f), + TOBN(0x70fb7dba, 0x7be5ac83), TOBN(0x636bb645, 0x36ec3eec)}}, + {{TOBN(0x27104ea3, 0x9754e21c), TOBN(0xbc87a3e6, 0x8d63c373), + TOBN(0x483351d7, 0x4109db9a), TOBN(0x0fa724e3, 0x60134da7)}, + {TOBN(0x9ff44c29, 0xb0720b16), TOBN(0x2dd0cf13, 0x06aceead), + TOBN(0x5942758c, 0xe26929a6), TOBN(0x96c5db92, 0xb766a92b)}}, + {{TOBN(0xcec7d4c0, 0x5f18395e), TOBN(0xd3f22744, 0x1f80d032), + TOBN(0x7a68b37a, 0xcb86075b), TOBN(0x074764dd, 0xafef92db)}, + {TOBN(0xded1e950, 0x7bc7f389), TOBN(0xc580c850, 0xb9756460), + TOBN(0xaeeec2a4, 0x7da48157), TOBN(0x3f0b4e7f, 0x82c587b3)}}, + {{TOBN(0x231c6de8, 0xa9f19c53), TOBN(0x5717bd73, 0x6974e34e), + TOBN(0xd9e1d216, 0xf1508fa9), TOBN(0x9f112361, 0xdadaa124)}, + {TOBN(0x80145e31, 0x823b7348), TOBN(0x4dd8f0d5, 0xac634069), + TOBN(0xe3d82fc7, 0x2297c258), TOBN(0x276fcfee, 0x9cee7431)}}, + {{TOBN(0x8eb61b5e, 0x2bc0aea9), TOBN(0x4f668fd5, 0xde329431), + TOBN(0x03a32ab1, 0x38e4b87e), TOBN(0xe1374517, 0x73d0ef0b)}, + {TOBN(0x1a46f7e6, 0x853ac983), TOBN(0xc3bdf42e, 0x68e78a57), + TOBN(0xacf20785, 
0x2ea96dd1), TOBN(0xa10649b9, 0xf1638460)}}, + {{TOBN(0xf2369f0b, 0x879fbbed), TOBN(0x0ff0ae86, 0xda9d1869), + TOBN(0x5251d759, 0x56766f45), TOBN(0x4984d8c0, 0x2be8d0fc)}, + {TOBN(0x7ecc95a6, 0xd21008f0), TOBN(0x29bd54a0, 0x3a1a1c49), + TOBN(0xab9828c5, 0xd26c50f3), TOBN(0x32c0087c, 0x51d0d251)}}, + {{TOBN(0x9bac3ce6, 0x0c1cdb26), TOBN(0xcd94d947, 0x557ca205), + TOBN(0x1b1bd598, 0x9db1fdcd), TOBN(0x0eda0108, 0xa3d8b149)}, + {TOBN(0x95066610, 0x56152fcc), TOBN(0xc2f037e6, 0xe7192b33), + TOBN(0xdeffb41a, 0xc92e05a4), TOBN(0x1105f6c2, 0xc2f6c62e)}}, + {{TOBN(0x68e73500, 0x8733913c), TOBN(0xcce86163, 0x3f3adc40), + TOBN(0xf407a942, 0x38a278e9), TOBN(0xd13c1b9d, 0x2ab21292)}, + {TOBN(0x93ed7ec7, 0x1c74cf5c), TOBN(0x8887dc48, 0xf1a4c1b4), + TOBN(0x3830ff30, 0x4b3a11f1), TOBN(0x358c5a3c, 0x58937cb6)}}, + {{TOBN(0x027dc404, 0x89022829), TOBN(0x40e93977, 0x3b798f79), + TOBN(0x90ad3337, 0x38be6ead), TOBN(0x9c23f6bc, 0xf34c0a5d)}, + {TOBN(0xd1711a35, 0xfbffd8bb), TOBN(0x60fcfb49, 0x1949d3dd), + TOBN(0x09c8ef4b, 0x7825d93a), TOBN(0x24233cff, 0xa0a8c968)}}, + {{TOBN(0x67ade46c, 0xe6d982af), TOBN(0xebb6bf3e, 0xe7544d7c), + TOBN(0xd6b9ba76, 0x3d8bd087), TOBN(0x46fe382d, 0x4dc61280)}, + {TOBN(0xbd39a7e8, 0xb5bdbd75), TOBN(0xab381331, 0xb8f228fe), + TOBN(0x0709a77c, 0xce1c4300), TOBN(0x6a247e56, 0xf337ceac)}}, + {{TOBN(0x8f34f21b, 0x636288be), TOBN(0x9dfdca74, 0xc8a7c305), + TOBN(0x6decfd1b, 0xea919e04), TOBN(0xcdf2688d, 0x8e1991f8)}, + {TOBN(0xe607df44, 0xd0f8a67e), TOBN(0xd985df4b, 0x0b58d010), + TOBN(0x57f834c5, 0x0c24f8f4), TOBN(0xe976ef56, 0xa0bf01ae)}}, + {{TOBN(0x536395ac, 0xa1c32373), TOBN(0x351027aa, 0x734c0a13), + TOBN(0xd2f1b5d6, 0x5e6bd5bc), TOBN(0x2b539e24, 0x223debed)}, + {TOBN(0xd4994cec, 0x0eaa1d71), TOBN(0x2a83381d, 0x661dcf65), + TOBN(0x5f1aed2f, 0x7b54c740), TOBN(0x0bea3fa5, 0xd6dda5ee)}}, + {{TOBN(0x9d4fb684, 0x36cc6134), TOBN(0x8eb9bbf3, 0xc0a443dd), + TOBN(0xfc500e2e, 0x383b7d2a), TOBN(0x7aad621c, 0x5b775257)}, + {TOBN(0x69284d74, 0x0a8f7cc0), TOBN(0xe820c2ce, 0x07562d65), + TOBN(0xbf9531b9, 0x499758ee), TOBN(0x73e95ca5, 0x6ee0cc2d)}}, + {{TOBN(0xf61790ab, 0xfbaf50a5), TOBN(0xdf55e76b, 0x684e0750), + TOBN(0xec516da7, 0xf176b005), TOBN(0x575553bb, 0x7a2dddc7)}, + {TOBN(0x37c87ca3, 0x553afa73), TOBN(0x315f3ffc, 0x4d55c251), + TOBN(0xe846442a, 0xaf3e5d35), TOBN(0x61b91149, 0x6495ff28)}}, + {{TOBN(0x23cc95d3, 0xfa326dc3), TOBN(0x1df4da1f, 0x18fc2cea), + TOBN(0x24bf9adc, 0xd0a37d59), TOBN(0xb6710053, 0x320d6e1e)}, + {TOBN(0x96f9667e, 0x618344d1), TOBN(0xcc7ce042, 0xa06445af), + TOBN(0xa02d8514, 0xd68dbc3a), TOBN(0x4ea109e4, 0x280b5a5b)}}, + {{TOBN(0x5741a7ac, 0xb40961bf), TOBN(0x4ada5937, 0x6aa56bfa), + TOBN(0x7feb9145, 0x02b765d1), TOBN(0x561e97be, 0xe6ad1582)}, + {TOBN(0xbbc4a5b6, 0xda3982f5), TOBN(0x0c2659ed, 0xb546f468), + TOBN(0xb8e7e6aa, 0x59612d20), TOBN(0xd83dfe20, 0xac19e8e0)}}, + {{TOBN(0x8530c45f, 0xb835398c), TOBN(0x6106a8bf, 0xb38a41c2), + TOBN(0x21e8f9a6, 0x35f5dcdb), TOBN(0x39707137, 0xcae498ed)}, + {TOBN(0x70c23834, 0xd8249f00), TOBN(0x9f14b58f, 0xab2537a0), + TOBN(0xd043c365, 0x5f61c0c2), TOBN(0xdc5926d6, 0x09a194a7)}}, + {{TOBN(0xddec0339, 0x8e77738a), TOBN(0xd07a63ef, 0xfba46426), + TOBN(0x2e58e79c, 0xee7f6e86), TOBN(0xe59b0459, 0xff32d241)}, + {TOBN(0xc5ec84e5, 0x20fa0338), TOBN(0x97939ac8, 0xeaff5ace), + TOBN(0x0310a4e3, 0xb4a38313), TOBN(0x9115fba2, 0x8f9d9885)}}, + {{TOBN(0x8dd710c2, 0x5fadf8c3), TOBN(0x66be38a2, 0xce19c0e2), + TOBN(0xd42a279c, 0x4cfe5022), TOBN(0x597bb530, 0x0e24e1b8)}, + {TOBN(0x3cde86b7, 0xc153ca7f), TOBN(0xa8d30fb3, 0x707d63bd), + 
TOBN(0xac905f92, 0xbd60d21e), TOBN(0x98e7ffb6, 0x7b9a54ab)}}, + {{TOBN(0xd7147df8, 0xe9726a30), TOBN(0xb5e216ff, 0xafce3533), + TOBN(0xb550b799, 0x2ff1ec40), TOBN(0x6b613b87, 0xa1e953fd)}, + {TOBN(0x87b88dba, 0x792d5610), TOBN(0x2ee1270a, 0xa190fbe1), + TOBN(0x02f4e2dc, 0x2ef581da), TOBN(0x016530e4, 0xeff82a95)}}, + {{TOBN(0xcbb93dfd, 0x8fd6ee89), TOBN(0x16d3d986, 0x46848fff), + TOBN(0x600eff24, 0x1da47adf), TOBN(0x1b9754a0, 0x0ad47a71)}, + {TOBN(0x8f9266df, 0x70c33b98), TOBN(0xaadc87ae, 0xdf34186e), + TOBN(0x0d2ce8e1, 0x4ad24132), TOBN(0x8a47cbfc, 0x19946eba)}}, + {{TOBN(0x47feeb66, 0x62b5f3af), TOBN(0xcefab561, 0x0abb3734), + TOBN(0x449de60e, 0x19f35cb1), TOBN(0x39f8db14, 0x157f0eb9)}, + {TOBN(0xffaecc5b, 0x3c61bfd6), TOBN(0xa5a4d41d, 0x41216703), + TOBN(0x7f8fabed, 0x224e1cc2), TOBN(0x0d5a8186, 0x871ad953)}}, + {{TOBN(0xf10774f7, 0xd22da9a9), TOBN(0x45b8a678, 0xcc8a9b0d), + TOBN(0xd9c2e722, 0xbdc32cff), TOBN(0xbf71b5f5, 0x337202a5)}, + {TOBN(0x95c57f2f, 0x69fc4db9), TOBN(0xb6dad34c, 0x765d01e1), + TOBN(0x7e0bd13f, 0xcb904635), TOBN(0x61751253, 0x763a588c)}}, + {{TOBN(0xd85c2997, 0x81af2c2d), TOBN(0xc0f7d9c4, 0x81b9d7da), + TOBN(0x838a34ae, 0x08533e8d), TOBN(0x15c4cb08, 0x311d8311)}, + {TOBN(0x97f83285, 0x8e121e14), TOBN(0xeea7dc1e, 0x85000a5f), + TOBN(0x0c6059b6, 0x5d256274), TOBN(0xec9beace, 0xb95075c0)}}, + {{TOBN(0x173daad7, 0x1df97828), TOBN(0xbf851cb5, 0xa8937877), + TOBN(0xb083c594, 0x01646f3c), TOBN(0x3bad30cf, 0x50c6d352)}, + {TOBN(0xfeb2b202, 0x496bbcea), TOBN(0x3cf9fd4f, 0x18a1e8ba), + TOBN(0xd26de7ff, 0x1c066029), TOBN(0x39c81e9e, 0x4e9ed4f8)}}, + {{TOBN(0xd8be0cb9, 0x7b390d35), TOBN(0x01df2bbd, 0x964aab27), + TOBN(0x3e8c1a65, 0xc3ef64f8), TOBN(0x567291d1, 0x716ed1dd)}, + {TOBN(0x95499c6c, 0x5f5406d3), TOBN(0x71fdda39, 0x5ba8e23f), + TOBN(0xcfeb320e, 0xd5096ece), TOBN(0xbe7ba92b, 0xca66dd16)}}, + {{TOBN(0x4608d36b, 0xc6fb5a7d), TOBN(0xe3eea15a, 0x6d2dd0e0), + TOBN(0x75b0a3eb, 0x8f97a36a), TOBN(0xf59814cc, 0x1c83de1e)}, + {TOBN(0x56c9c5b0, 0x1c33c23f), TOBN(0xa96c1da4, 0x6faa4136), + TOBN(0x46bf2074, 0xde316551), TOBN(0x3b866e7b, 0x1f756c8f)}}, + {{TOBN(0x727727d8, 0x1495ed6b), TOBN(0xb2394243, 0xb682dce7), + TOBN(0x8ab8454e, 0x758610f3), TOBN(0xc243ce84, 0x857d72a4)}, + {TOBN(0x7b320d71, 0xdbbf370f), TOBN(0xff9afa37, 0x78e0f7ca), + TOBN(0x0119d1e0, 0xea7b523f), TOBN(0xb997f8cb, 0x058c7d42)}}, + {{TOBN(0x285bcd2a, 0x37bbb184), TOBN(0x51dcec49, 0xa45d1fa6), + TOBN(0x6ade3b64, 0xe29634cb), TOBN(0x080c94a7, 0x26b86ef1)}, + {TOBN(0xba583db1, 0x2283fbe3), TOBN(0x902bddc8, 0x5a9315ed), + TOBN(0x07c1ccb3, 0x86964bec), TOBN(0x78f4eacf, 0xb6258301)}}, + {{TOBN(0x4bdf3a49, 0x56f90823), TOBN(0xba0f5080, 0x741d777b), + TOBN(0x091d71c3, 0xf38bf760), TOBN(0x9633d50f, 0x9b625b02)}, + {TOBN(0x03ecb743, 0xb8c9de61), TOBN(0xb4751254, 0x5de74720), + TOBN(0x9f9defc9, 0x74ce1cb2), TOBN(0x774a4f6a, 0x00bd32ef)}}, + {{TOBN(0xaca385f7, 0x73848f22), TOBN(0x53dad716, 0xf3f8558e), + TOBN(0xab7b34b0, 0x93c471f9), TOBN(0xf530e069, 0x19644bc7)}, + {TOBN(0x3d9fb1ff, 0xdd59d31a), TOBN(0x4382e0df, 0x08daa795), + TOBN(0x165c6f4b, 0xd5cc88d7), TOBN(0xeaa392d5, 0x4a18c900)}}, + {{TOBN(0x94203c67, 0x648024ee), TOBN(0x188763f2, 0x8c2fabcd), + TOBN(0xa80f87ac, 0xbbaec835), TOBN(0x632c96e0, 0xf29d8d54)}, + {TOBN(0x29b0a60e, 0x4c00a95e), TOBN(0x2ef17f40, 0xe011e9fa), + TOBN(0xf6c0e1d1, 0x15b77223), TOBN(0xaaec2c62, 0x14b04e32)}}, + {{TOBN(0xd35688d8, 0x3d84e58c), TOBN(0x2af5094c, 0x958571db), + TOBN(0x4fff7e19, 0x760682a6), TOBN(0x4cb27077, 0xe39a407c)}, + {TOBN(0x0f59c547, 0x4ff0e321), TOBN(0x169f34a6, 0x1b34c8ff), 
+ TOBN(0x2bff1096, 0x52bc1ba7), TOBN(0xa25423b7, 0x83583544)}}, + {{TOBN(0x5d55d5d5, 0x0ac8b782), TOBN(0xff6622ec, 0x2db3c892), + TOBN(0x48fce741, 0x6b8bb642), TOBN(0x31d6998c, 0x69d7e3dc)}, + {TOBN(0xdbaf8004, 0xcadcaed0), TOBN(0x801b0142, 0xd81d053c), + TOBN(0x94b189fc, 0x59630ec6), TOBN(0x120e9934, 0xaf762c8e)}}, + {{TOBN(0x53a29aa4, 0xfdc6a404), TOBN(0x19d8e01e, 0xa1909948), + TOBN(0x3cfcabf1, 0xd7e89681), TOBN(0x3321a50d, 0x4e132d37)}, + {TOBN(0xd0496863, 0xe9a86111), TOBN(0x8c0cde61, 0x06a3bc65), + TOBN(0xaf866c49, 0xfc9f8eef), TOBN(0x2066350e, 0xff7f5141)}}, + {{TOBN(0x4f8a4689, 0xe56ddfbd), TOBN(0xea1b0c07, 0xfe32983a), + TOBN(0x2b317462, 0x873cb8cb), TOBN(0x658deddc, 0x2d93229f)}, + {TOBN(0x65efaf4d, 0x0f64ef58), TOBN(0xfe43287d, 0x730cc7a8), + TOBN(0xaebc0c72, 0x3d047d70), TOBN(0x92efa539, 0xd92d26c9)}}, + {{TOBN(0x06e78457, 0x94b56526), TOBN(0x415cb80f, 0x0961002d), + TOBN(0x89e5c565, 0x76dcb10f), TOBN(0x8bbb6982, 0xff9259fe)}, + {TOBN(0x4fe8795b, 0x9abc2668), TOBN(0xb5d4f534, 0x1e678fb1), + TOBN(0x6601f3be, 0x7b7da2b9), TOBN(0x98da59e2, 0xa13d6805)}}, + {{TOBN(0x190d8ea6, 0x01799a52), TOBN(0xa20cec41, 0xb86d2952), + TOBN(0x3062ffb2, 0x7fff2a7c), TOBN(0x741b32e5, 0x79f19d37)}, + {TOBN(0xf80d8181, 0x4eb57d47), TOBN(0x7a2d0ed4, 0x16aef06b), + TOBN(0x09735fb0, 0x1cecb588), TOBN(0x1641caaa, 0xc6061f5b)}}}, + {{{TOBN(0x7f99824f, 0x20151427), TOBN(0x206828b6, 0x92430206), + TOBN(0xaa9097d7, 0xe1112357), TOBN(0xacf9a2f2, 0x09e414ec)}, + {TOBN(0xdbdac9da, 0x27915356), TOBN(0x7e0734b7, 0x001efee3), + TOBN(0x54fab5bb, 0xd2b288e2), TOBN(0x4c630fc4, 0xf62dd09c)}}, + {{TOBN(0x8537107a, 0x1ac2703b), TOBN(0xb49258d8, 0x6bc857b5), + TOBN(0x57df14de, 0xbcdaccd1), TOBN(0x24ab68d7, 0xc4ae8529)}, + {TOBN(0x7ed8b5d4, 0x734e59d0), TOBN(0x5f8740c8, 0xc495cc80), + TOBN(0x84aedd5a, 0x291db9b3), TOBN(0x80b360f8, 0x4fb995be)}}, + {{TOBN(0xae915f5d, 0x5fa067d1), TOBN(0x4134b57f, 0x9668960c), + TOBN(0xbd3656d6, 0xa48edaac), TOBN(0xdac1e3e4, 0xfc1d7436)}, + {TOBN(0x674ff869, 0xd81fbb26), TOBN(0x449ed3ec, 0xb26c33d4), + TOBN(0x85138705, 0xd94203e8), TOBN(0xccde538b, 0xbeeb6f4a)}}, + {{TOBN(0x55d5c68d, 0xa61a76fa), TOBN(0x598b441d, 0xca1554dc), + TOBN(0xd39923b9, 0x773b279c), TOBN(0x33331d3c, 0x36bf9efc)}, + {TOBN(0x2d4c848e, 0x298de399), TOBN(0xcfdb8e77, 0xa1a27f56), + TOBN(0x94c855ea, 0x57b8ab70), TOBN(0xdcdb9dae, 0x6f7879ba)}}, + {{TOBN(0x7bdff8c2, 0x019f2a59), TOBN(0xb3ce5bb3, 0xcb4fbc74), + TOBN(0xea907f68, 0x8a9173dd), TOBN(0x6cd3d0d3, 0x95a75439)}, + {TOBN(0x92ecc4d6, 0xefed021c), TOBN(0x09a9f9b0, 0x6a77339a), + TOBN(0x87ca6b15, 0x7188c64a), TOBN(0x10c29968, 0x44899158)}}, + {{TOBN(0x5859a229, 0xed6e82ef), TOBN(0x16f338e3, 0x65ebaf4e), + TOBN(0x0cd31387, 0x5ead67ae), TOBN(0x1c73d228, 0x54ef0bb4)}, + {TOBN(0x4cb55131, 0x74a5c8c7), TOBN(0x01cd2970, 0x7f69ad6a), + TOBN(0xa04d00dd, 0xe966f87e), TOBN(0xd96fe447, 0x0b7b0321)}}, + {{TOBN(0x342ac06e, 0x88fbd381), TOBN(0x02cd4a84, 0x5c35a493), + TOBN(0xe8fa89de, 0x54f1bbcd), TOBN(0x341d6367, 0x2575ed4c)}, + {TOBN(0xebe357fb, 0xd238202b), TOBN(0x600b4d1a, 0xa984ead9), + TOBN(0xc35c9f44, 0x52436ea0), TOBN(0x96fe0a39, 0xa370751b)}}, + {{TOBN(0x4c4f0736, 0x7f636a38), TOBN(0x9f943fb7, 0x0e76d5cb), + TOBN(0xb03510ba, 0xa8b68b8b), TOBN(0xc246780a, 0x9ed07a1f)}, + {TOBN(0x3c051415, 0x6d549fc2), TOBN(0xc2953f31, 0x607781ca), + TOBN(0x955e2c69, 0xd8d95413), TOBN(0xb300fadc, 0x7bd282e3)}}, + {{TOBN(0x81fe7b50, 0x87e9189f), TOBN(0xdb17375c, 0xf42dda27), + TOBN(0x22f7d896, 0xcf0a5904), TOBN(0xa0e57c5a, 0xebe348e6)}, + {TOBN(0xa61011d3, 0xf40e3c80), TOBN(0xb1189321, 
0x8db705c5), + TOBN(0x4ed9309e, 0x50fedec3), TOBN(0xdcf14a10, 0x4d6d5c1d)}}, + {{TOBN(0x056c265b, 0x55691342), TOBN(0xe8e08504, 0x91049dc7), + TOBN(0x131329f5, 0xc9bae20a), TOBN(0x96c8b3e8, 0xd9dccdb4)}, + {TOBN(0x8c5ff838, 0xfb4ee6b4), TOBN(0xfc5a9aeb, 0x41e8ccf0), + TOBN(0x7417b764, 0xfae050c6), TOBN(0x0953c3d7, 0x00452080)}}, + {{TOBN(0x21372682, 0x38dfe7e8), TOBN(0xea417e15, 0x2bb79d4b), + TOBN(0x59641f1c, 0x76e7cf2d), TOBN(0x271e3059, 0xea0bcfcc)}, + {TOBN(0x624c7dfd, 0x7253ecbd), TOBN(0x2f552e25, 0x4fca6186), + TOBN(0xcbf84ecd, 0x4d866e9c), TOBN(0x73967709, 0xf68d4610)}}, + {{TOBN(0xa14b1163, 0xc27901b4), TOBN(0xfd9236e0, 0x899b8bf3), + TOBN(0x42b091ec, 0xcbc6da0a), TOBN(0xbb1dac6f, 0x5ad1d297)}, + {TOBN(0x80e61d53, 0xa91cf76e), TOBN(0x4110a412, 0xd31f1ee7), + TOBN(0x2d87c3ba, 0x13efcf77), TOBN(0x1f374bb4, 0xdf450d76)}}, + {{TOBN(0x5e78e2f2, 0x0d188dab), TOBN(0xe3968ed0, 0xf4b885ef), + TOBN(0x46c0568e, 0x7314570f), TOBN(0x31616338, 0x01170521)}, + {TOBN(0x18e1e7e2, 0x4f0c8afe), TOBN(0x4caa75ff, 0xdeea78da), + TOBN(0x82db67f2, 0x7c5d8a51), TOBN(0x36a44d86, 0x6f505370)}}, + {{TOBN(0xd72c5bda, 0x0333974f), TOBN(0x5db516ae, 0x27a70146), + TOBN(0x34705281, 0x210ef921), TOBN(0xbff17a8f, 0x0c9c38e5)}, + {TOBN(0x78f4814e, 0x12476da1), TOBN(0xc1e16613, 0x33c16980), + TOBN(0x9e5b386f, 0x424d4bca), TOBN(0x4c274e87, 0xc85740de)}}, + {{TOBN(0xb6a9b88d, 0x6c2f5226), TOBN(0x14d1b944, 0x550d7ca8), + TOBN(0x580c85fc, 0x1fc41709), TOBN(0xc1da368b, 0x54c6d519)}, + {TOBN(0x2b0785ce, 0xd5113cf7), TOBN(0x0670f633, 0x5a34708f), + TOBN(0x46e23767, 0x15cc3f88), TOBN(0x1b480cfa, 0x50c72c8f)}}, + {{TOBN(0x20288602, 0x4147519a), TOBN(0xd0981eac, 0x26b372f0), + TOBN(0xa9d4a7ca, 0xa785ebc8), TOBN(0xd953c50d, 0xdbdf58e9)}, + {TOBN(0x9d6361cc, 0xfd590f8f), TOBN(0x72e9626b, 0x44e6c917), + TOBN(0x7fd96110, 0x22eb64cf), TOBN(0x863ebb7e, 0x9eb288f3)}}, + {{TOBN(0x6e6ab761, 0x6aca8ee7), TOBN(0x97d10b39, 0xd7b40358), + TOBN(0x1687d377, 0x1e5feb0d), TOBN(0xc83e50e4, 0x8265a27a)}, + {TOBN(0x8f75a9fe, 0xc954b313), TOBN(0xcc2e8f47, 0x310d1f61), + TOBN(0xf5ba81c5, 0x6557d0e0), TOBN(0x25f9680c, 0x3eaf6207)}}, + {{TOBN(0xf95c6609, 0x4354080b), TOBN(0x5225bfa5, 0x7bf2fe1c), + TOBN(0xc5c004e2, 0x5c7d98fa), TOBN(0x3561bf1c, 0x019aaf60)}, + {TOBN(0x5e6f9f17, 0xba151474), TOBN(0xdec2f934, 0xb04f6eca), + TOBN(0x64e368a1, 0x269acb1e), TOBN(0x1332d9e4, 0x0cdda493)}}, + {{TOBN(0x60d6cf69, 0xdf23de05), TOBN(0x66d17da2, 0x009339a0), + TOBN(0x9fcac985, 0x0a693923), TOBN(0xbcf057fc, 0xed7c6a6d)}, + {TOBN(0xc3c5c8c5, 0xf0b5662c), TOBN(0x25318dd8, 0xdcba4f24), + TOBN(0x60e8cb75, 0x082b69ff), TOBN(0x7c23b3ee, 0x1e728c01)}}, + {{TOBN(0x15e10a0a, 0x097e4403), TOBN(0xcb3d0a86, 0x19854665), + TOBN(0x88d8e211, 0xd67d4826), TOBN(0xb39af66e, 0x0b9d2839)}, + {TOBN(0xa5f94588, 0xbd475ca8), TOBN(0xe06b7966, 0xc077b80b), + TOBN(0xfedb1485, 0xda27c26c), TOBN(0xd290d33a, 0xfe0fd5e0)}}, + {{TOBN(0xa40bcc47, 0xf34fb0fa), TOBN(0xb4760cc8, 0x1fb1ab09), + TOBN(0x8fca0993, 0xa273bfe3), TOBN(0x13e4fe07, 0xf70b213c)}, + {TOBN(0x3bcdb992, 0xfdb05163), TOBN(0x8c484b11, 0x0c2b19b6), + TOBN(0x1acb815f, 0xaaf2e3e2), TOBN(0xc6905935, 0xb89ff1b4)}}, + {{TOBN(0xb2ad6f9d, 0x586e74e1), TOBN(0x488883ad, 0x67b80484), + TOBN(0x758aa2c7, 0x369c3ddb), TOBN(0x8ab74e69, 0x9f9afd31)}, + {TOBN(0x10fc2d28, 0x5e21beb1), TOBN(0x3484518a, 0x318c42f9), + TOBN(0x377427dc, 0x53cf40c3), TOBN(0x9de0781a, 0x391bc1d9)}}, + {{TOBN(0x8faee858, 0x693807e1), TOBN(0xa3865327, 0x4e81ccc7), + TOBN(0x02c30ff2, 0x6f835b84), TOBN(0xb604437b, 0x0d3d38d4)}, + {TOBN(0xb3fc8a98, 0x5ca1823d), 
TOBN(0xb82f7ec9, 0x03be0324), + TOBN(0xee36d761, 0xcf684a33), TOBN(0x5a01df0e, 0x9f29bf7d)}}, + {{TOBN(0x686202f3, 0x1306583d), TOBN(0x05b10da0, 0x437c622e), + TOBN(0xbf9aaa0f, 0x076a7bc8), TOBN(0x25e94efb, 0x8f8f4e43)}, + {TOBN(0x8a35c9b7, 0xfa3dc26d), TOBN(0xe0e5fb93, 0x96ff03c5), + TOBN(0xa77e3843, 0xebc394ce), TOBN(0xcede6595, 0x8361de60)}}, + {{TOBN(0xd27c22f6, 0xa1993545), TOBN(0xab01cc36, 0x24d671ba), + TOBN(0x63fa2877, 0xa169c28e), TOBN(0x925ef904, 0x2eb08376)}, + {TOBN(0x3b2fa3cf, 0x53aa0b32), TOBN(0xb27beb5b, 0x71c49d7a), + TOBN(0xb60e1834, 0xd105e27f), TOBN(0xd6089788, 0x4f68570d)}}, + {{TOBN(0x23094ce0, 0xd6fbc2ac), TOBN(0x738037a1, 0x815ff551), + TOBN(0xda73b1bb, 0x6bef119c), TOBN(0xdcf6c430, 0xeef506ba)}, + {TOBN(0x00e4fe7b, 0xe3ef104a), TOBN(0xebdd9a2c, 0x0a065628), + TOBN(0x853a81c3, 0x8792043e), TOBN(0x22ad6ece, 0xb3b59108)}}, + {{TOBN(0x9fb813c0, 0x39cd297d), TOBN(0x8ec7e16e, 0x05bda5d9), + TOBN(0x2834797c, 0x0d104b96), TOBN(0xcc11a2e7, 0x7c511510)}, + {TOBN(0x96ca5a53, 0x96ee6380), TOBN(0x054c8655, 0xcea38742), + TOBN(0xb5946852, 0xd54dfa7d), TOBN(0x97c422e7, 0x1f4ab207)}}, + {{TOBN(0xbf907509, 0x0c22b540), TOBN(0x2cde42aa, 0xb7c267d4), + TOBN(0xba18f9ed, 0x5ab0d693), TOBN(0x3ba62aa6, 0x6e4660d9)}, + {TOBN(0xb24bf97b, 0xab9ea96a), TOBN(0x5d039642, 0xe3b60e32), + TOBN(0x4e6a4506, 0x7c4d9bd5), TOBN(0x666c5b9e, 0x7ed4a6a4)}}, + {{TOBN(0xfa3fdcd9, 0x8edbd7cc), TOBN(0x4660bb87, 0xc6ccd753), + TOBN(0x9ae90820, 0x21e6b64f), TOBN(0x8a56a713, 0xb36bfb3f)}, + {TOBN(0xabfce096, 0x5726d47f), TOBN(0x9eed01b2, 0x0b1a9a7f), + TOBN(0x30e9cad4, 0x4eb74a37), TOBN(0x7b2524cc, 0x53e9666d)}}, + {{TOBN(0x6a29683b, 0x8f4b002f), TOBN(0xc2200d7a, 0x41f4fc20), + TOBN(0xcf3af47a, 0x3a338acc), TOBN(0x6539a4fb, 0xe7128975)}, + {TOBN(0xcec31c14, 0xc33c7fcf), TOBN(0x7eb6799b, 0xc7be322b), + TOBN(0x119ef4e9, 0x6646f623), TOBN(0x7b7a26a5, 0x54d7299b)}}, + {{TOBN(0xcb37f08d, 0x403f46f2), TOBN(0x94b8fc43, 0x1a0ec0c7), + TOBN(0xbb8514e3, 0xc332142f), TOBN(0xf3ed2c33, 0xe80d2a7a)}, + {TOBN(0x8d2080af, 0xb639126c), TOBN(0xf7b6be60, 0xe3553ade), + TOBN(0x3950aa9f, 0x1c7e2b09), TOBN(0x847ff958, 0x6410f02b)}}, + {{TOBN(0x877b7cf5, 0x678a31b0), TOBN(0xd50301ae, 0x3998b620), + TOBN(0x734257c5, 0xc00fb396), TOBN(0xf9fb18a0, 0x04e672a6)}, + {TOBN(0xff8bd8eb, 0xe8758851), TOBN(0x1e64e4c6, 0x5d99ba44), + TOBN(0x4b8eaedf, 0x7dfd93b7), TOBN(0xba2f2a98, 0x04e76b8c)}}, + {{TOBN(0x7d790cba, 0xe8053433), TOBN(0xc8e725a0, 0x3d2c9585), + TOBN(0x58c5c476, 0xcdd8f5ed), TOBN(0xd106b952, 0xefa9fe1d)}, + {TOBN(0x3c5c775b, 0x0eff13a9), TOBN(0x242442ba, 0xe057b930), + TOBN(0xe9f458d4, 0xc9b70cbd), TOBN(0x69b71448, 0xa3cdb89a)}}, + {{TOBN(0x41ee46f6, 0x0e2ed742), TOBN(0x573f1045, 0x40067493), + TOBN(0xb1e154ff, 0x9d54c304), TOBN(0x2ad0436a, 0x8d3a7502)}, + {TOBN(0xee4aaa2d, 0x431a8121), TOBN(0xcd38b3ab, 0x886f11ed), + TOBN(0x57d49ea6, 0x034a0eb7), TOBN(0xd2b773bd, 0xf7e85e58)}}, + {{TOBN(0x4a559ac4, 0x9b5c1f14), TOBN(0xc444be1a, 0x3e54df2b), + TOBN(0x13aad704, 0xeda41891), TOBN(0xcd927bec, 0x5eb5c788)}, + {TOBN(0xeb3c8516, 0xe48c8a34), TOBN(0x1b7ac812, 0x4b546669), + TOBN(0x1815f896, 0x594df8ec), TOBN(0x87c6a79c, 0x79227865)}}, + {{TOBN(0xae02a2f0, 0x9b56ddbd), TOBN(0x1339b5ac, 0x8a2f1cf3), + TOBN(0xf2b569c7, 0x839dff0d), TOBN(0xb0b9e864, 0xfee9a43d)}, + {TOBN(0x4ff8ca41, 0x77bb064e), TOBN(0x145a2812, 0xfd249f63), + TOBN(0x3ab7beac, 0xf86f689a), TOBN(0x9bafec27, 0x01d35f5e)}}, + {{TOBN(0x28054c65, 0x4265aa91), TOBN(0xa4b18304, 0x035efe42), + TOBN(0x6887b0e6, 0x9639dec7), TOBN(0xf4b8f6ad, 0x3d52aea5)}, + {TOBN(0xfb9293cc, 
0x971a8a13), TOBN(0x3f159e5d, 0x4c934d07), + TOBN(0x2c50e9b1, 0x09acbc29), TOBN(0x08eb65e6, 0x7154d129)}}, + {{TOBN(0x4feff589, 0x30b75c3e), TOBN(0x0bb82fe2, 0x94491c93), + TOBN(0xd8ac377a, 0x89af62bb), TOBN(0xd7b51490, 0x9685e49f)}, + {TOBN(0xabca9a7b, 0x04497f19), TOBN(0x1b35ed0a, 0x1a7ad13f), + TOBN(0x6b601e21, 0x3ec86ed6), TOBN(0xda91fcb9, 0xce0c76f1)}}, + {{TOBN(0x9e28507b, 0xd7ab27e1), TOBN(0x7c19a555, 0x63945b7b), + TOBN(0x6b43f0a1, 0xaafc9827), TOBN(0x443b4fbd, 0x3aa55b91)}, + {TOBN(0x962b2e65, 0x6962c88f), TOBN(0x139da8d4, 0xce0db0ca), + TOBN(0xb93f05dd, 0x1b8d6c4f), TOBN(0x779cdff7, 0x180b9824)}}, + {{TOBN(0xbba23fdd, 0xae57c7b7), TOBN(0x345342f2, 0x1b932522), + TOBN(0xfd9c80fe, 0x556d4aa3), TOBN(0xa03907ba, 0x6525bb61)}, + {TOBN(0x38b010e1, 0xff218933), TOBN(0xc066b654, 0xaa52117b), + TOBN(0x8e141920, 0x94f2e6ea), TOBN(0x66a27dca, 0x0d32f2b2)}}, + {{TOBN(0x69c7f993, 0x048b3717), TOBN(0xbf5a989a, 0xb178ae1c), + TOBN(0x49fa9058, 0x564f1d6b), TOBN(0x27ec6e15, 0xd31fde4e)}, + {TOBN(0x4cce0373, 0x7276e7fc), TOBN(0x64086d79, 0x89d6bf02), + TOBN(0x5a72f046, 0x4ccdd979), TOBN(0x909c3566, 0x47775631)}}, + {{TOBN(0x1c07bc6b, 0x75dd7125), TOBN(0xb4c6bc97, 0x87a0428d), + TOBN(0x507ece52, 0xfdeb6b9d), TOBN(0xfca56512, 0xb2c95432)}, + {TOBN(0x15d97181, 0xd0e8bd06), TOBN(0x384dd317, 0xc6bb46ea), + TOBN(0x5441ea20, 0x3952b624), TOBN(0xbcf70dee, 0x4e7dc2fb)}}, + {{TOBN(0x372b016e, 0x6628e8c3), TOBN(0x07a0d667, 0xb60a7522), + TOBN(0xcf05751b, 0x0a344ee2), TOBN(0x0ec09a48, 0x118bdeec)}, + {TOBN(0x6e4b3d4e, 0xd83dce46), TOBN(0x43a6316d, 0x99d2fc6e), + TOBN(0xa99d8989, 0x56cf044c), TOBN(0x7c7f4454, 0xae3e5fb7)}}, + {{TOBN(0xb2e6b121, 0xfbabbe92), TOBN(0x281850fb, 0xe1330076), + TOBN(0x093581ec, 0x97890015), TOBN(0x69b1dded, 0x75ff77f5)}, + {TOBN(0x7cf0b18f, 0xab105105), TOBN(0x953ced31, 0xa89ccfef), + TOBN(0x3151f85f, 0xeb914009), TOBN(0x3c9f1b87, 0x88ed48ad)}}, + {{TOBN(0xc9aba1a1, 0x4a7eadcb), TOBN(0x928e7501, 0x522e71cf), + TOBN(0xeaede727, 0x3a2e4f83), TOBN(0x467e10d1, 0x1ce3bbd3)}, + {TOBN(0xf3442ac3, 0xb955dcf0), TOBN(0xba96307d, 0xd3d5e527), + TOBN(0xf763a10e, 0xfd77f474), TOBN(0x5d744bd0, 0x6a6e1ff0)}}, + {{TOBN(0xd287282a, 0xa777899e), TOBN(0xe20eda8f, 0xd03f3cde), + TOBN(0x6a7e75bb, 0x50b07d31), TOBN(0x0b7e2a94, 0x6f379de4)}, + {TOBN(0x31cb64ad, 0x19f593cf), TOBN(0x7b1a9e4f, 0x1e76ef1d), + TOBN(0xe18c9c9d, 0xb62d609c), TOBN(0x439bad6d, 0xe779a650)}}, + {{TOBN(0x219d9066, 0xe032f144), TOBN(0x1db632b8, 0xe8b2ec6a), + TOBN(0xff0d0fd4, 0xfda12f78), TOBN(0x56fb4c2d, 0x2a25d265)}, + {TOBN(0x5f4e2ee1, 0x255a03f1), TOBN(0x61cd6af2, 0xe96af176), + TOBN(0xe0317ba8, 0xd068bc97), TOBN(0x927d6bab, 0x264b988e)}}, + {{TOBN(0xa18f07e0, 0xe90fb21e), TOBN(0x00fd2b80, 0xbba7fca1), + TOBN(0x20387f27, 0x95cd67b5), TOBN(0x5b89a4e7, 0xd39707f7)}, + {TOBN(0x8f83ad3f, 0x894407ce), TOBN(0xa0025b94, 0x6c226132), + TOBN(0xc79563c7, 0xf906c13b), TOBN(0x5f548f31, 0x4e7bb025)}}, + {{TOBN(0x2b4c6b8f, 0xeac6d113), TOBN(0xa67e3f9c, 0x0e813c76), + TOBN(0x3982717c, 0x3fe1f4b9), TOBN(0x58865819, 0x26d8050e)}, + {TOBN(0x99f3640c, 0xf7f06f20), TOBN(0xdc610216, 0x2a66ebc2), + TOBN(0x52f2c175, 0x767a1e08), TOBN(0x05660e1a, 0x5999871b)}}, + {{TOBN(0x6b0f1762, 0x6d3c4693), TOBN(0xf0e7d627, 0x37ed7bea), + TOBN(0xc51758c7, 0xb75b226d), TOBN(0x40a88628, 0x1f91613b)}, + {TOBN(0x889dbaa7, 0xbbb38ce0), TOBN(0xe0404b65, 0xbddcad81), + TOBN(0xfebccd3a, 0x8bc9671f), TOBN(0xfbf9a357, 0xee1f5375)}}, + {{TOBN(0x5dc169b0, 0x28f33398), TOBN(0xb07ec11d, 0x72e90f65), + TOBN(0xae7f3b4a, 0xfaab1eb1), TOBN(0xd970195e, 0x5f17538a)}, + 
{TOBN(0x52b05cbe, 0x0181e640), TOBN(0xf5debd62, 0x2643313d), + TOBN(0x76148154, 0x5df31f82), TOBN(0x23e03b33, 0x3a9e13c5)}}, + {{TOBN(0xff758949, 0x4fde0c1f), TOBN(0xbf8a1abe, 0xe5b6ec20), + TOBN(0x702278fb, 0x87e1db6c), TOBN(0xc447ad7a, 0x35ed658f)}, + {TOBN(0x48d4aa38, 0x03d0ccf2), TOBN(0x80acb338, 0x819a7c03), + TOBN(0x9bc7c89e, 0x6e17cecc), TOBN(0x46736b8b, 0x03be1d82)}}, + {{TOBN(0xd65d7b60, 0xc0432f96), TOBN(0xddebe7a3, 0xdeb5442f), + TOBN(0x79a25307, 0x7dff69a2), TOBN(0x37a56d94, 0x02cf3122)}, + {TOBN(0x8bab8aed, 0xf2350d0a), TOBN(0x13c3f276, 0x037b0d9a), + TOBN(0xc664957c, 0x44c65cae), TOBN(0x88b44089, 0xc2e71a88)}}, + {{TOBN(0xdb88e5a3, 0x5cb02664), TOBN(0x5d4c0bf1, 0x8686c72e), + TOBN(0xea3d9b62, 0xa682d53e), TOBN(0x9b605ef4, 0x0b2ad431)}, + {TOBN(0x71bac202, 0xc69645d0), TOBN(0xa115f03a, 0x6a1b66e7), + TOBN(0xfe2c563a, 0x158f4dc4), TOBN(0xf715b3a0, 0x4d12a78c)}}, + {{TOBN(0x8f7f0a48, 0xd413213a), TOBN(0x2035806d, 0xc04becdb), + TOBN(0xecd34a99, 0x5d8587f5), TOBN(0x4d8c3079, 0x9f6d3a71)}, + {TOBN(0x1b2a2a67, 0x8d95a8f6), TOBN(0xc58c9d7d, 0xf2110d0d), + TOBN(0xdeee81d5, 0xcf8fba3f), TOBN(0xa42be3c0, 0x0c7cdf68)}}, + {{TOBN(0x2126f742, 0xd43b5eaa), TOBN(0x054a0766, 0xdfa59b85), + TOBN(0x9d0d5e36, 0x126bfd45), TOBN(0xa1f8fbd7, 0x384f8a8f)}, + {TOBN(0x317680f5, 0xd563fccc), TOBN(0x48ca5055, 0xf280a928), + TOBN(0xe00b81b2, 0x27b578cf), TOBN(0x10aad918, 0x2994a514)}}, + {{TOBN(0xd9e07b62, 0xb7bdc953), TOBN(0x9f0f6ff2, 0x5bc086dd), + TOBN(0x09d1ccff, 0x655eee77), TOBN(0x45475f79, 0x5bef7df1)}, + {TOBN(0x3faa28fa, 0x86f702cc), TOBN(0x92e60905, 0x0f021f07), + TOBN(0xe9e62968, 0x7f8fa8c6), TOBN(0xbd71419a, 0xf036ea2c)}}, + {{TOBN(0x171ee1cc, 0x6028da9a), TOBN(0x5352fe1a, 0xc251f573), + TOBN(0xf8ff236e, 0x3fa997f4), TOBN(0xd831b6c9, 0xa5749d5f)}, + {TOBN(0x7c872e1d, 0xe350e2c2), TOBN(0xc56240d9, 0x1e0ce403), + TOBN(0xf9deb077, 0x6974f5cb), TOBN(0x7d50ba87, 0x961c3728)}}, + {{TOBN(0xd6f89426, 0x5a3a2518), TOBN(0xcf817799, 0xc6303d43), + TOBN(0x510a0471, 0x619e5696), TOBN(0xab049ff6, 0x3a5e307b)}, + {TOBN(0xe4cdf9b0, 0xfeb13ec7), TOBN(0xd5e97117, 0x9d8ff90c), + TOBN(0xf6f64d06, 0x9afa96af), TOBN(0x00d0bf5e, 0x9d2012a2)}}, + {{TOBN(0xe63f301f, 0x358bcdc0), TOBN(0x07689e99, 0x0a9d47f8), + TOBN(0x1f689e2f, 0x4f43d43a), TOBN(0x4d542a16, 0x90920904)}, + {TOBN(0xaea293d5, 0x9ca0a707), TOBN(0xd061fe45, 0x8ac68065), + TOBN(0x1033bf1b, 0x0090008c), TOBN(0x29749558, 0xc08a6db6)}}, + {{TOBN(0x74b5fc59, 0xc1d5d034), TOBN(0xf712e9f6, 0x67e215e0), + TOBN(0xfd520cbd, 0x860200e6), TOBN(0x0229acb4, 0x3ea22588)}, + {TOBN(0x9cd1e14c, 0xfff0c82e), TOBN(0x87684b62, 0x59c69e73), + TOBN(0xda85e61c, 0x96ccb989), TOBN(0x2d5dbb02, 0xa3d06493)}}, + {{TOBN(0xf22ad33a, 0xe86b173c), TOBN(0xe8e41ea5, 0xa79ff0e3), + TOBN(0x01d2d725, 0xdd0d0c10), TOBN(0x31f39088, 0x032d28f9)}, + {TOBN(0x7b3f71e1, 0x7829839e), TOBN(0x0cf691b4, 0x4502ae58), + TOBN(0xef658dbd, 0xbefc6115), TOBN(0xa5cd6ee5, 0xb3ab5314)}}, + {{TOBN(0x206c8d7b, 0x5f1d2347), TOBN(0x794645ba, 0x4cc2253a), + TOBN(0xd517d8ff, 0x58389e08), TOBN(0x4fa20dee, 0x9f847288)}, + {TOBN(0xeba072d8, 0xd797770a), TOBN(0x7360c91d, 0xbf429e26), + TOBN(0x7200a3b3, 0x80af8279), TOBN(0x6a1c9150, 0x82dadce3)}}, + {{TOBN(0x0ee6d3a7, 0xc35d8794), TOBN(0x042e6558, 0x0356bae5), + TOBN(0x9f59698d, 0x643322fd), TOBN(0x9379ae15, 0x50a61967)}, + {TOBN(0x64b9ae62, 0xfcc9981e), TOBN(0xaed3d631, 0x6d2934c6), + TOBN(0x2454b302, 0x5e4e65eb), TOBN(0xab09f647, 0xf9950428)}}}, + {{{TOBN(0xb2083a12, 0x22248acc), TOBN(0x1f6ec0ef, 0x3264e366), + TOBN(0x5659b704, 0x5afdee28), TOBN(0x7a823a40, 
0xe6430bb5)}, + {TOBN(0x24592a04, 0xe1900a79), TOBN(0xcde09d4a, 0xc9ee6576), + TOBN(0x52b6463f, 0x4b5ea54a), TOBN(0x1efe9ed3, 0xd3ca65a7)}}, + {{TOBN(0xe27a6dbe, 0x305406dd), TOBN(0x8eb7dc7f, 0xdd5d1957), + TOBN(0xf54a6876, 0x387d4d8f), TOBN(0x9c479409, 0xc7762de4)}, + {TOBN(0xbe4d5b5d, 0x99b30778), TOBN(0x25380c56, 0x6e793682), + TOBN(0x602d37f3, 0xdac740e3), TOBN(0x140deabe, 0x1566e4ae)}}, + {{TOBN(0x4481d067, 0xafd32acf), TOBN(0xd8f0fcca, 0xe1f71ccf), + TOBN(0xd208dd0c, 0xb596f2da), TOBN(0xd049d730, 0x9aad93f9)}, + {TOBN(0xc79f263d, 0x42ab580e), TOBN(0x09411bb1, 0x23f707b4), + TOBN(0x8cfde1ff, 0x835e0eda), TOBN(0x72707490, 0x90f03402)}}, + {{TOBN(0xeaee6126, 0xc49a861e), TOBN(0x024f3b65, 0xe14f0d06), + TOBN(0x51a3f1e8, 0xc69bfc17), TOBN(0xc3c3a8e9, 0xa7686381)}, + {TOBN(0x3400752c, 0xb103d4c8), TOBN(0x02bc4613, 0x9218b36b), + TOBN(0xc67f75eb, 0x7651504a), TOBN(0xd6848b56, 0xd02aebfa)}}, + {{TOBN(0xbd9802e6, 0xc30fa92b), TOBN(0x5a70d96d, 0x9a552784), + TOBN(0x9085c4ea, 0x3f83169b), TOBN(0xfa9423bb, 0x06908228)}, + {TOBN(0x2ffebe12, 0xfe97a5b9), TOBN(0x85da6049, 0x71b99118), + TOBN(0x9cbc2f7f, 0x63178846), TOBN(0xfd96bc70, 0x9153218e)}}, + {{TOBN(0x958381db, 0x1782269b), TOBN(0xae34bf79, 0x2597e550), + TOBN(0xbb5c6064, 0x5f385153), TOBN(0x6f0e96af, 0xe3088048)}, + {TOBN(0xbf6a0215, 0x77884456), TOBN(0xb3b5688c, 0x69310ea7), + TOBN(0x17c94295, 0x04fad2de), TOBN(0xe020f0e5, 0x17896d4d)}}, + {{TOBN(0x730ba0ab, 0x0976505f), TOBN(0x567f6813, 0x095e2ec5), + TOBN(0x47062010, 0x6331ab71), TOBN(0x72cfa977, 0x41d22b9f)}, + {TOBN(0x33e55ead, 0x8a2373da), TOBN(0xa8d0d5f4, 0x7ba45a68), + TOBN(0xba1d8f9c, 0x03029d15), TOBN(0x8f34f1cc, 0xfc55b9f3)}}, + {{TOBN(0xcca4428d, 0xbbe5a1a9), TOBN(0x8187fd5f, 0x3126bd67), + TOBN(0x0036973a, 0x48105826), TOBN(0xa39b6663, 0xb8bd61a0)}, + {TOBN(0x6d42deef, 0x2d65a808), TOBN(0x4969044f, 0x94636b19), + TOBN(0xf611ee47, 0xdd5d564c), TOBN(0x7b2f3a49, 0xd2873077)}}, + {{TOBN(0x94157d45, 0x300eb294), TOBN(0x2b2a656e, 0x169c1494), + TOBN(0xc000dd76, 0xd3a47aa9), TOBN(0xa2864e4f, 0xa6243ea4)}, + {TOBN(0x82716c47, 0xdb89842e), TOBN(0x12dfd7d7, 0x61479fb7), + TOBN(0x3b9a2c56, 0xe0b2f6dc), TOBN(0x46be862a, 0xd7f85d67)}}, + {{TOBN(0x03b0d8dd, 0x0f82b214), TOBN(0x460c34f9, 0xf103cbc6), + TOBN(0xf32e5c03, 0x18d79e19), TOBN(0x8b8888ba, 0xa84117f8)}, + {TOBN(0x8f3c37dc, 0xc0722677), TOBN(0x10d21be9, 0x1c1c0f27), + TOBN(0xd47c8468, 0xe0f7a0c6), TOBN(0x9bf02213, 0xadecc0e0)}}, + {{TOBN(0x0baa7d12, 0x42b48b99), TOBN(0x1bcb665d, 0x48424096), + TOBN(0x8b847cd6, 0xebfb5cfb), TOBN(0x87c2ae56, 0x9ad4d10d)}, + {TOBN(0xf1cbb122, 0x0de36726), TOBN(0xe7043c68, 0x3fdfbd21), + TOBN(0x4bd0826a, 0x4e79d460), TOBN(0x11f5e598, 0x4bd1a2cb)}}, + {{TOBN(0x97554160, 0xb7fe7b6e), TOBN(0x7d16189a, 0x400a3fb2), + TOBN(0xd73e9bea, 0xe328ca1e), TOBN(0x0dd04b97, 0xe793d8cc)}, + {TOBN(0xa9c83c9b, 0x506db8cc), TOBN(0x5cd47aae, 0xcf38814c), + TOBN(0x26fc430d, 0xb64b45e6), TOBN(0x079b5499, 0xd818ea84)}}, + {{TOBN(0xebb01102, 0xc1c24a3b), TOBN(0xca24e568, 0x1c161c1a), + TOBN(0x103eea69, 0x36f00a4a), TOBN(0x9ad76ee8, 0x76176c7b)}, + {TOBN(0x97451fc2, 0x538e0ff7), TOBN(0x94f89809, 0x6604b3b0), + TOBN(0x6311436e, 0x3249cfd7), TOBN(0x27b4a7bd, 0x41224f69)}}, + {{TOBN(0x03b5d21a, 0xe0ac2941), TOBN(0x279b0254, 0xc2d31937), + TOBN(0x3307c052, 0xcac992d0), TOBN(0x6aa7cb92, 0xefa8b1f3)}, + {TOBN(0x5a182580, 0x0d37c7a5), TOBN(0x13380c37, 0x342d5422), + TOBN(0x92ac2d66, 0xd5d2ef92), TOBN(0x035a70c9, 0x030c63c6)}}, + {{TOBN(0xc16025dd, 0x4ce4f152), TOBN(0x1f419a71, 0xf9df7c06), + TOBN(0x6d5b2214, 0x91e4bb14), 
TOBN(0xfc43c6cc, 0x839fb4ce)}, + {TOBN(0x49f06591, 0x925d6b2d), TOBN(0x4b37d9d3, 0x62186598), + TOBN(0x8c54a971, 0xd01b1629), TOBN(0xe1a9c29f, 0x51d50e05)}}, + {{TOBN(0x5109b785, 0x71ba1861), TOBN(0x48b22d5c, 0xd0c8f93d), + TOBN(0xe8fa84a7, 0x8633bb93), TOBN(0x53fba6ba, 0x5aebbd08)}, + {TOBN(0x7ff27df3, 0xe5eea7d8), TOBN(0x521c8796, 0x68ca7158), + TOBN(0xb9d5133b, 0xce6f1a05), TOBN(0x2d50cd53, 0xfd0ebee4)}}, + {{TOBN(0xc82115d6, 0xc5a3ef16), TOBN(0x993eff9d, 0xba079221), + TOBN(0xe4da2c5e, 0x4b5da81c), TOBN(0x9a89dbdb, 0x8033fd85)}, + {TOBN(0x60819ebf, 0x2b892891), TOBN(0x53902b21, 0x5d14a4d5), + TOBN(0x6ac35051, 0xd7fda421), TOBN(0xcc6ab885, 0x61c83284)}}, + {{TOBN(0x14eba133, 0xf74cff17), TOBN(0x240aaa03, 0xecb813f2), + TOBN(0xcfbb6540, 0x6f665bee), TOBN(0x084b1fe4, 0xa425ad73)}, + {TOBN(0x009d5d16, 0xd081f6a6), TOBN(0x35304fe8, 0xeef82c90), + TOBN(0xf20346d5, 0xaa9eaa22), TOBN(0x0ada9f07, 0xac1c91e3)}}, + {{TOBN(0xa6e21678, 0x968a6144), TOBN(0x54c1f77c, 0x07b31a1e), + TOBN(0xd6bb787e, 0x5781fbe1), TOBN(0x61bd2ee0, 0xe31f1c4a)}, + {TOBN(0xf25aa1e9, 0x781105fc), TOBN(0x9cf2971f, 0x7b2f8e80), + TOBN(0x26d15412, 0xcdff919b), TOBN(0x01db4ebe, 0x34bc896e)}}, + {{TOBN(0x7d9b3e23, 0xb40df1cf), TOBN(0x59337373, 0x94e971b4), + TOBN(0xbf57bd14, 0x669cf921), TOBN(0x865daedf, 0x0c1a1064)}, + {TOBN(0x3eb70bd3, 0x83279125), TOBN(0xbc3d5b9f, 0x34ecdaab), + TOBN(0x91e3ed7e, 0x5f755caf), TOBN(0x49699f54, 0xd41e6f02)}}, + {{TOBN(0x185770e1, 0xd4a7a15b), TOBN(0x08f3587a, 0xeaac87e7), + TOBN(0x352018db, 0x473133ea), TOBN(0x674ce719, 0x04fd30fc)}, + {TOBN(0x7b8d9835, 0x088b3e0e), TOBN(0x7a0356a9, 0x5d0d47a1), + TOBN(0x9d9e7659, 0x6474a3c4), TOBN(0x61ea48a7, 0xff66966c)}}, + {{TOBN(0x30417758, 0x0f3e4834), TOBN(0xfdbb21c2, 0x17a9afcb), + TOBN(0x756fa17f, 0x2f9a67b3), TOBN(0x2a6b2421, 0xa245c1a8)}, + {TOBN(0x64be2794, 0x4af02291), TOBN(0xade465c6, 0x2a5804fe), + TOBN(0x8dffbd39, 0xa6f08fd7), TOBN(0xc4efa84c, 0xaa14403b)}}, + {{TOBN(0xa1b91b2a, 0x442b0f5c), TOBN(0xb748e317, 0xcf997736), + TOBN(0x8d1b62bf, 0xcee90e16), TOBN(0x907ae271, 0x0b2078c0)}, + {TOBN(0xdf31534b, 0x0c9bcddd), TOBN(0x043fb054, 0x39adce83), + TOBN(0x99031043, 0xd826846a), TOBN(0x61a9c0d6, 0xb144f393)}}, + {{TOBN(0xdab48046, 0x47718427), TOBN(0xdf17ff9b, 0x6e830f8b), + TOBN(0x408d7ee8, 0xe49a1347), TOBN(0x6ac71e23, 0x91c1d4ae)}, + {TOBN(0xc8cbb9fd, 0x1defd73c), TOBN(0x19840657, 0xbbbbfec5), + TOBN(0x39db1cb5, 0x9e7ef8ea), TOBN(0x78aa8296, 0x64105f30)}}, + {{TOBN(0xa3d9b7f0, 0xa3738c29), TOBN(0x0a2f235a, 0xbc3250a3), + TOBN(0x55e506f6, 0x445e4caf), TOBN(0x0974f73d, 0x33475f7a)}, + {TOBN(0xd37dbba3, 0x5ba2f5a8), TOBN(0x542c6e63, 0x6af40066), + TOBN(0x26d99b53, 0xc5d73e2c), TOBN(0x06060d7d, 0x6c3ca33e)}}, + {{TOBN(0xcdbef1c2, 0x065fef4a), TOBN(0x77e60f7d, 0xfd5b92e3), + TOBN(0xd7c549f0, 0x26708350), TOBN(0x201b3ad0, 0x34f121bf)}, + {TOBN(0x5fcac2a1, 0x0334fc14), TOBN(0x8a9a9e09, 0x344552f6), + TOBN(0x7dd8a1d3, 0x97653082), TOBN(0x5fc0738f, 0x79d4f289)}}, + {{TOBN(0x787d244d, 0x17d2d8c3), TOBN(0xeffc6345, 0x70830684), + TOBN(0x5ddb96dd, 0xe4f73ae5), TOBN(0x8efb14b1, 0x172549a5)}, + {TOBN(0x6eb73eee, 0x2245ae7a), TOBN(0xbca4061e, 0xea11f13e), + TOBN(0xb577421d, 0x30b01f5d), TOBN(0xaa688b24, 0x782e152c)}}, + {{TOBN(0x67608e71, 0xbd3502ba), TOBN(0x4ef41f24, 0xb4de75a0), + TOBN(0xb08dde5e, 0xfd6125e5), TOBN(0xde484825, 0xa409543f)}, + {TOBN(0x1f198d98, 0x65cc2295), TOBN(0x428a3771, 0x6e0edfa2), + TOBN(0x4f9697a2, 0xadf35fc7), TOBN(0x01a43c79, 0xf7cac3c7)}}, + {{TOBN(0xb05d7059, 0x0fd3659a), TOBN(0x8927f30c, 0xbb7f2d9a), + TOBN(0x4023d1ac, 
0x8cf984d3), TOBN(0x32125ed3, 0x02897a45)}, + {TOBN(0xfb572dad, 0x3d414205), TOBN(0x73000ef2, 0xe3fa82a9), + TOBN(0x4c0868e9, 0xf10a5581), TOBN(0x5b61fc67, 0x6b0b3ca5)}}, + {{TOBN(0xc1258d5b, 0x7cae440c), TOBN(0x21c08b41, 0x402b7531), + TOBN(0xf61a8955, 0xde932321), TOBN(0x3568faf8, 0x2d1408af)}, + {TOBN(0x71b15e99, 0x9ecf965b), TOBN(0xf14ed248, 0xe917276f), + TOBN(0xc6f4caa1, 0x820cf9e2), TOBN(0x681b20b2, 0x18d83c7e)}}, + {{TOBN(0x6cde738d, 0xc6c01120), TOBN(0x71db0813, 0xae70e0db), + TOBN(0x95fc0644, 0x74afe18c), TOBN(0x34619053, 0x129e2be7)}, + {TOBN(0x80615cea, 0xdb2a3b15), TOBN(0x0a49a19e, 0xdb4c7073), + TOBN(0x0e1b84c8, 0x8fd2d367), TOBN(0xd74bf462, 0x033fb8aa)}}, + {{TOBN(0x889f6d65, 0x533ef217), TOBN(0x7158c7e4, 0xc3ca2e87), + TOBN(0xfb670dfb, 0xdc2b4167), TOBN(0x75910a01, 0x844c257f)}, + {TOBN(0xf336bf07, 0xcf88577d), TOBN(0x22245250, 0xe45e2ace), + TOBN(0x2ed92e8d, 0x7ca23d85), TOBN(0x29f8be4c, 0x2b812f58)}}, + {{TOBN(0xdd9ebaa7, 0x076fe12b), TOBN(0x3f2400cb, 0xae1537f9), + TOBN(0x1aa93528, 0x17bdfb46), TOBN(0xc0f98430, 0x67883b41)}, + {TOBN(0x5590ede1, 0x0170911d), TOBN(0x7562f5bb, 0x34d4b17f), + TOBN(0xe1fa1df2, 0x1826b8d2), TOBN(0xb40b796a, 0x6bd80d59)}}, + {{TOBN(0xd65bf197, 0x3467ba92), TOBN(0x8c9b46db, 0xf70954b0), + TOBN(0x97c8a0f3, 0x0e78f15d), TOBN(0xa8f3a69a, 0x85a4c961)}, + {TOBN(0x4242660f, 0x61e4ce9b), TOBN(0xbf06aab3, 0x6ea6790c), + TOBN(0xc6706f8e, 0xec986416), TOBN(0x9e56dec1, 0x9a9fc225)}}, + {{TOBN(0x527c46f4, 0x9a9898d9), TOBN(0xd799e77b, 0x5633cdef), + TOBN(0x24eacc16, 0x7d9e4297), TOBN(0xabb61cea, 0x6b1cb734)}, + {TOBN(0xbee2e8a7, 0xf778443c), TOBN(0x3bb42bf1, 0x29de2fe6), + TOBN(0xcbed86a1, 0x3003bb6f), TOBN(0xd3918e6c, 0xd781cdf6)}}, + {{TOBN(0x4bee3271, 0x9a5103f1), TOBN(0x5243efc6, 0xf50eac06), + TOBN(0xb8e122cb, 0x6adcc119), TOBN(0x1b7faa84, 0xc0b80a08)}, + {TOBN(0x32c3d1bd, 0x6dfcd08c), TOBN(0x129dec4e, 0x0be427de), + TOBN(0x98ab679c, 0x1d263c83), TOBN(0xafc83cb7, 0xcef64eff)}}, + {{TOBN(0x85eb6088, 0x2fa6be76), TOBN(0x892585fb, 0x1328cbfe), + TOBN(0xc154d3ed, 0xcf618dda), TOBN(0xc44f601b, 0x3abaf26e)}, + {TOBN(0x7bf57d0b, 0x2be1fdfd), TOBN(0xa833bd2d, 0x21137fee), + TOBN(0x9353af36, 0x2db591a8), TOBN(0xc76f26dc, 0x5562a056)}}, + {{TOBN(0x1d87e47d, 0x3fdf5a51), TOBN(0x7afb5f93, 0x55c9cab0), + TOBN(0x91bbf58f, 0x89e0586e), TOBN(0x7c72c018, 0x0d843709)}, + {TOBN(0xa9a5aafb, 0x99b5c3dc), TOBN(0xa48a0f1d, 0x3844aeb0), + TOBN(0x7178b7dd, 0xb667e482), TOBN(0x453985e9, 0x6e23a59a)}}, + {{TOBN(0x4a54c860, 0x01b25dd8), TOBN(0x0dd37f48, 0xfb897c8a), + TOBN(0x5f8aa610, 0x0ea90cd9), TOBN(0xc8892c68, 0x16d5830d)}, + {TOBN(0xeb4befc0, 0xef514ca5), TOBN(0x478eb679, 0xe72c9ee6), + TOBN(0x9bca20da, 0xdbc40d5f), TOBN(0xf015de21, 0xdde4f64a)}}, + {{TOBN(0xaa6a4de0, 0xeaf4b8a5), TOBN(0x68cfd9ca, 0x4bc60e32), + TOBN(0x668a4b01, 0x7fd15e70), TOBN(0xd9f0694a, 0xf27dc09d)}, + {TOBN(0xf6c3cad5, 0xba708bcd), TOBN(0x5cd2ba69, 0x5bb95c2a), + TOBN(0xaa28c1d3, 0x33c0a58f), TOBN(0x23e274e3, 0xabc77870)}}, + {{TOBN(0x44c3692d, 0xdfd20a4a), TOBN(0x091c5fd3, 0x81a66653), + TOBN(0x6c0bb691, 0x09a0757d), TOBN(0x9072e8b9, 0x667343ea)}, + {TOBN(0x31d40eb0, 0x80848bec), TOBN(0x95bd480a, 0x79fd36cc), + TOBN(0x01a77c61, 0x65ed43f5), TOBN(0xafccd127, 0x2e0d40bf)}}, + {{TOBN(0xeccfc82d, 0x1cc1884b), TOBN(0xc85ac201, 0x5d4753b4), + TOBN(0xc7a6caac, 0x658e099f), TOBN(0xcf46369e, 0x04b27390)}, + {TOBN(0xe2e7d049, 0x506467ea), TOBN(0x481b63a2, 0x37cdeccc), + TOBN(0x4029abd8, 0xed80143a), TOBN(0x28bfe3c7, 0xbcb00b88)}}, + {{TOBN(0x3bec1009, 0x0643d84a), TOBN(0x885f3668, 0xabd11041), + 
TOBN(0xdb02432c, 0xf83a34d6), TOBN(0x32f7b360, 0x719ceebe)}, + {TOBN(0xf06c7837, 0xdad1fe7a), TOBN(0x60a157a9, 0x5441a0b0), + TOBN(0x704970e9, 0xe2d47550), TOBN(0xcd2bd553, 0x271b9020)}}, + {{TOBN(0xff57f82f, 0x33e24a0b), TOBN(0x9cbee23f, 0xf2565079), + TOBN(0x16353427, 0xeb5f5825), TOBN(0x276feec4, 0xe948d662)}, + {TOBN(0xd1b62bc6, 0xda10032b), TOBN(0x718351dd, 0xf0e72a53), + TOBN(0x93452076, 0x2420e7ba), TOBN(0x96368fff, 0x3a00118d)}}, + {{TOBN(0x00ce2d26, 0x150a49e4), TOBN(0x0c28b636, 0x3f04706b), + TOBN(0xbad65a46, 0x58b196d0), TOBN(0x6c8455fc, 0xec9f8b7c)}, + {TOBN(0xe90c895f, 0x2d71867e), TOBN(0x5c0be31b, 0xedf9f38c), + TOBN(0x2a37a15e, 0xd8f6ec04), TOBN(0x239639e7, 0x8cd85251)}}, + {{TOBN(0xd8975315, 0x9c7c4c6b), TOBN(0x603aa3c0, 0xd7409af7), + TOBN(0xb8d53d0c, 0x007132fb), TOBN(0x68d12af7, 0xa6849238)}, + {TOBN(0xbe0607e7, 0xbf5d9279), TOBN(0x9aa50055, 0xaada74ce), + TOBN(0xe81079cb, 0xba7e8ccb), TOBN(0x610c71d1, 0xa5f4ff5e)}}, + {{TOBN(0x9e2ee1a7, 0x5aa07093), TOBN(0xca84004b, 0xa75da47c), + TOBN(0x074d3951, 0x3de75401), TOBN(0xf938f756, 0xbb311592)}, + {TOBN(0x96197618, 0x00a43421), TOBN(0x39a25362, 0x07bc78c8), + TOBN(0x278f710a, 0x0a171276), TOBN(0xb28446ea, 0x8d1a8f08)}}, + {{TOBN(0x184781bf, 0xe3b6a661), TOBN(0x7751cb1d, 0xe6d279f7), + TOBN(0xf8ff95d6, 0xc59eb662), TOBN(0x186d90b7, 0x58d3dea7)}, + {TOBN(0x0e4bb6c1, 0xdfb4f754), TOBN(0x5c5cf56b, 0x2b2801dc), + TOBN(0xc561e452, 0x1f54564d), TOBN(0xb4fb8c60, 0xf0dd7f13)}}, + {{TOBN(0xf8849630, 0x33ff98c7), TOBN(0x9619fffa, 0xcf17769c), + TOBN(0xf8090bf6, 0x1bfdd80a), TOBN(0x14d9a149, 0x422cfe63)}, + {TOBN(0xb354c360, 0x6f6df9ea), TOBN(0xdbcf770d, 0x218f17ea), + TOBN(0x207db7c8, 0x79eb3480), TOBN(0x213dbda8, 0x559b6a26)}}, + {{TOBN(0xac4c200b, 0x29fc81b3), TOBN(0xebc3e09f, 0x171d87c1), + TOBN(0x91799530, 0x1481aa9e), TOBN(0x051b92e1, 0x92e114fa)}, + {TOBN(0xdf8f92e9, 0xecb5537f), TOBN(0x44b1b2cc, 0x290c7483), + TOBN(0xa711455a, 0x2adeb016), TOBN(0x964b6856, 0x81a10c2c)}}, + {{TOBN(0x4f159d99, 0xcec03623), TOBN(0x05532225, 0xef3271ea), + TOBN(0xb231bea3, 0xc5ee4849), TOBN(0x57a54f50, 0x7094f103)}, + {TOBN(0x3e2d421d, 0x9598b352), TOBN(0xe865a49c, 0x67412ab4), + TOBN(0xd2998a25, 0x1cc3a912), TOBN(0x5d092808, 0x0c74d65d)}}, + {{TOBN(0x73f45908, 0x4088567a), TOBN(0xeb6b280e, 0x1f214a61), + TOBN(0x8c9adc34, 0xcaf0c13d), TOBN(0x39d12938, 0xf561fb80)}, + {TOBN(0xb2dc3a5e, 0xbc6edfb4), TOBN(0x7485b1b1, 0xfe4d210e), + TOBN(0x062e0400, 0xe186ae72), TOBN(0x91e32d5c, 0x6eeb3b88)}}, + {{TOBN(0x6df574d7, 0x4be59224), TOBN(0xebc88ccc, 0x716d55f3), + TOBN(0x26c2e6d0, 0xcad6ed33), TOBN(0xc6e21e7d, 0x0d3e8b10)}, + {TOBN(0x2cc5840e, 0x5bcc36bb), TOBN(0x9292445e, 0x7da74f69), + TOBN(0x8be8d321, 0x4e5193a8), TOBN(0x3ec23629, 0x8df06413)}}, + {{TOBN(0xc7e9ae85, 0xb134defa), TOBN(0x6073b1d0, 0x1bb2d475), + TOBN(0xb9ad615e, 0x2863c00d), TOBN(0x9e29493d, 0x525f4ac4)}, + {TOBN(0xc32b1dea, 0x4e9acf4f), TOBN(0x3e1f01c8, 0xa50db88d), + TOBN(0xb05d70ea, 0x04da916c), TOBN(0x714b0d0a, 0xd865803e)}}, + {{TOBN(0x4bd493fc, 0x9920cb5e), TOBN(0x5b44b1f7, 0x92c7a3ac), + TOBN(0xa2a77293, 0xbcec9235), TOBN(0x5ee06e87, 0xcd378553)}, + {TOBN(0xceff8173, 0xda621607), TOBN(0x2bb03e4c, 0x99f5d290), + TOBN(0x2945106a, 0xa6f734ac), TOBN(0xb5056604, 0xd25c4732)}}, + {{TOBN(0x5945920c, 0xe079afee), TOBN(0x686e17a0, 0x6789831f), + TOBN(0x5966bee8, 0xb74a5ae5), TOBN(0x38a673a2, 0x1e258d46)}, + {TOBN(0xbd1cc1f2, 0x83141c95), TOBN(0x3b2ecf4f, 0x0e96e486), + TOBN(0xcd3aa896, 0x74e5fc78), TOBN(0x415ec10c, 0x2482fa7a)}}, + {{TOBN(0x15234419, 0x80503380), TOBN(0x513d917a, 0xd314b392), 
+ TOBN(0xb0b52f4e, 0x63caecae), TOBN(0x07bf22ad, 0x2dc7780b)}, + {TOBN(0xe761e8a1, 0xe4306839), TOBN(0x1b3be962, 0x5dd7feaa), + TOBN(0x4fe728de, 0x74c778f1), TOBN(0xf1fa0bda, 0x5e0070f6)}}, + {{TOBN(0x85205a31, 0x6ec3f510), TOBN(0x2c7e4a14, 0xd2980475), + TOBN(0xde3c19c0, 0x6f30ebfd), TOBN(0xdb1c1f38, 0xd4b7e644)}, + {TOBN(0xfe291a75, 0x5dce364a), TOBN(0xb7b22a3c, 0x058f5be3), + TOBN(0x2cd2c302, 0x37fea38c), TOBN(0x2930967a, 0x2e17be17)}}, + {{TOBN(0x87f009de, 0x0c061c65), TOBN(0xcb014aac, 0xedc6ed44), + TOBN(0x49bd1cb4, 0x3bafb1eb), TOBN(0x81bd8b5c, 0x282d3688)}, + {TOBN(0x1cdab87e, 0xf01a17af), TOBN(0x21f37ac4, 0xe710063b), + TOBN(0x5a6c5676, 0x42fc8193), TOBN(0xf4753e70, 0x56a6015c)}}, + {{TOBN(0x020f795e, 0xa15b0a44), TOBN(0x8f37c8d7, 0x8958a958), + TOBN(0x63b7e89b, 0xa4b675b5), TOBN(0xb4fb0c0c, 0x0fc31aea)}, + {TOBN(0xed95e639, 0xa7ff1f2e), TOBN(0x9880f5a3, 0x619614fb), + TOBN(0xdeb6ff02, 0x947151ab), TOBN(0x5bc5118c, 0xa868dcdb)}}, + {{TOBN(0xd8da2055, 0x4c20cea5), TOBN(0xcac2776e, 0x14c4d69a), + TOBN(0xcccb22c1, 0x622d599b), TOBN(0xa4ddb653, 0x68a9bb50)}, + {TOBN(0x2c4ff151, 0x1b4941b4), TOBN(0xe1ff19b4, 0x6efba588), + TOBN(0x35034363, 0xc48345e0), TOBN(0x45542e3d, 0x1e29dfc4)}}, + {{TOBN(0xf197cb91, 0x349f7aed), TOBN(0x3b2b5a00, 0x8fca8420), + TOBN(0x7c175ee8, 0x23aaf6d8), TOBN(0x54dcf421, 0x35af32b6)}, + {TOBN(0x0ba14307, 0x27d6561e), TOBN(0x879d5ee4, 0xd175b1e2), + TOBN(0xc7c43673, 0x99807db5), TOBN(0x77a54455, 0x9cd55bcd)}}, + {{TOBN(0xe6c2ff13, 0x0105c072), TOBN(0x18f7a99f, 0x8dda7da4), + TOBN(0x4c301820, 0x0e2d35c1), TOBN(0x06a53ca0, 0xd9cc6c82)}, + {TOBN(0xaa21cc1e, 0xf1aa1d9e), TOBN(0x32414334, 0x4a75b1e8), + TOBN(0x2a6d1328, 0x0ebe9fdc), TOBN(0x16bd173f, 0x98a4755a)}}, + {{TOBN(0xfbb9b245, 0x2133ffd9), TOBN(0x39a8b2f1, 0x830f1a20), + TOBN(0x484bc97d, 0xd5a1f52a), TOBN(0xd6aebf56, 0xa40eddf8)}, + {TOBN(0x32257acb, 0x76ccdac6), TOBN(0xaf4d36ec, 0x1586ff27), + TOBN(0x8eaa8863, 0xf8de7dd1), TOBN(0x0045d5cf, 0x88647c16)}}}, + {{{TOBN(0xa6f3d574, 0xc005979d), TOBN(0xc2072b42, 0x6a40e350), + TOBN(0xfca5c156, 0x8de2ecf9), TOBN(0xa8c8bf5b, 0xa515344e)}, + {TOBN(0x97aee555, 0x114df14a), TOBN(0xd4374a4d, 0xfdc5ec6b), + TOBN(0x754cc28f, 0x2ca85418), TOBN(0x71cb9e27, 0xd3c41f78)}}, + {{TOBN(0x89105079, 0x03605c39), TOBN(0xf0843d9e, 0xa142c96c), + TOBN(0xf3744934, 0x16923684), TOBN(0x732caa2f, 0xfa0a2893)}, + {TOBN(0xb2e8c270, 0x61160170), TOBN(0xc32788cc, 0x437fbaa3), + TOBN(0x39cd818e, 0xa6eda3ac), TOBN(0xe2e94239, 0x9e2b2e07)}}, + {{TOBN(0x6967d39b, 0x0260e52a), TOBN(0xd42585cc, 0x90653325), + TOBN(0x0d9bd605, 0x21ca7954), TOBN(0x4fa20877, 0x81ed57b3)}, + {TOBN(0x60c1eff8, 0xe34a0bbe), TOBN(0x56b0040c, 0x84f6ef64), + TOBN(0x28be2b24, 0xb1af8483), TOBN(0xb2278163, 0xf5531614)}}, + {{TOBN(0x8df27545, 0x5922ac1c), TOBN(0xa7b3ef5c, 0xa52b3f63), + TOBN(0x8e77b214, 0x71de57c4), TOBN(0x31682c10, 0x834c008b)}, + {TOBN(0xc76824f0, 0x4bd55d31), TOBN(0xb6d1c086, 0x17b61c71), + TOBN(0x31db0903, 0xc2a5089d), TOBN(0x9c092172, 0x184e5d3f)}}, + {{TOBN(0xdd7ced5b, 0xc00cc638), TOBN(0x1a2015eb, 0x61278fc2), + TOBN(0x2e8e5288, 0x6a37f8d6), TOBN(0xc457786f, 0xe79933ad)}, + {TOBN(0xb3fe4cce, 0x2c51211a), TOBN(0xad9b10b2, 0x24c20498), + TOBN(0x90d87a4f, 0xd28db5e5), TOBN(0x698cd105, 0x3aca2fc3)}}, + {{TOBN(0x4f112d07, 0xe91b536d), TOBN(0xceb982f2, 0x9eba09d6), + TOBN(0x3c157b2c, 0x197c396f), TOBN(0xe23c2d41, 0x7b66eb24)}, + {TOBN(0x480c57d9, 0x3f330d37), TOBN(0xb3a4c8a1, 0x79108deb), + TOBN(0x702388de, 0xcb199ce5), TOBN(0x0b019211, 0xb944a8d4)}}, + {{TOBN(0x24f2a692, 0x840bb336), TOBN(0x7c353bdc, 
0xa669fa7b), + TOBN(0xda20d6fc, 0xdec9c300), TOBN(0x625fbe2f, 0xa13a4f17)}, + {TOBN(0xa2b1b61a, 0xdbc17328), TOBN(0x008965bf, 0xa9515621), + TOBN(0x49690939, 0xc620ff46), TOBN(0x182dd27d, 0x8717e91c)}}, + {{TOBN(0x5ace5035, 0xea6c3997), TOBN(0x54259aaa, 0xc2610bef), + TOBN(0xef18bb3f, 0x3c80dd39), TOBN(0x6910b95b, 0x5fc3fa39)}, + {TOBN(0xfce2f510, 0x43e09aee), TOBN(0xced56c9f, 0xa7675665), + TOBN(0x10e265ac, 0xd872db61), TOBN(0x6982812e, 0xae9fce69)}}, + {{TOBN(0x29be11c6, 0xce800998), TOBN(0x72bb1752, 0xb90360d9), + TOBN(0x2c193197, 0x5a4ad590), TOBN(0x2ba2f548, 0x9fc1dbc0)}, + {TOBN(0x7fe4eebb, 0xe490ebe0), TOBN(0x12a0a4cd, 0x7fae11c0), + TOBN(0x7197cf81, 0xe903ba37), TOBN(0xcf7d4aa8, 0xde1c6dd8)}}, + {{TOBN(0x92af6bf4, 0x3fd5684c), TOBN(0x2b26eecf, 0x80360aa1), + TOBN(0xbd960f30, 0x00546a82), TOBN(0x407b3c43, 0xf59ad8fe)}, + {TOBN(0x86cae5fe, 0x249c82ba), TOBN(0x9e0faec7, 0x2463744c), + TOBN(0x87f551e8, 0x94916272), TOBN(0x033f9344, 0x6ceb0615)}}, + {{TOBN(0x1e5eb0d1, 0x8be82e84), TOBN(0x89967f0e, 0x7a582fef), + TOBN(0xbcf687d5, 0xa6e921fa), TOBN(0xdfee4cf3, 0xd37a09ba)}, + {TOBN(0x94f06965, 0xb493c465), TOBN(0x638b9a1c, 0x7635c030), + TOBN(0x76667864, 0x66f05e9f), TOBN(0xccaf6808, 0xc04da725)}}, + {{TOBN(0xca2eb690, 0x768fccfc), TOBN(0xf402d37d, 0xb835b362), + TOBN(0x0efac0d0, 0xe2fdfcce), TOBN(0xefc9cdef, 0xb638d990)}, + {TOBN(0x2af12b72, 0xd1669a8b), TOBN(0x33c536bc, 0x5774ccbd), + TOBN(0x30b21909, 0xfb34870e), TOBN(0xc38fa2f7, 0x7df25aca)}}, + {{TOBN(0x74c5f02b, 0xbf81f3f5), TOBN(0x0525a5ae, 0xaf7e4581), + TOBN(0x88d2aaba, 0x433c54ae), TOBN(0xed9775db, 0x806a56c5)}, + {TOBN(0xd320738a, 0xc0edb37d), TOBN(0x25fdb6ee, 0x66cc1f51), + TOBN(0xac661d17, 0x10600d76), TOBN(0x931ec1f3, 0xbdd1ed76)}}, + {{TOBN(0x65c11d62, 0x19ee43f1), TOBN(0x5cd57c3e, 0x60829d97), + TOBN(0xd26c91a3, 0x984be6e8), TOBN(0xf08d9309, 0x8b0c53bd)}, + {TOBN(0x94bc9e5b, 0xc016e4ea), TOBN(0xd3916839, 0x11d43d2b), + TOBN(0x886c5ad7, 0x73701155), TOBN(0xe0377626, 0x20b00715)}}, + {{TOBN(0x7f01c9ec, 0xaa80ba59), TOBN(0x3083411a, 0x68538e51), + TOBN(0x970370f1, 0xe88128af), TOBN(0x625cc3db, 0x91dec14b)}, + {TOBN(0xfef9666c, 0x01ac3107), TOBN(0xb2a8d577, 0xd5057ac3), + TOBN(0xb0f26299, 0x92be5df7), TOBN(0xf579c8e5, 0x00353924)}}, + {{TOBN(0xb8fa3d93, 0x1341ed7a), TOBN(0x4223272c, 0xa7b59d49), + TOBN(0x3dcb1947, 0x83b8c4a4), TOBN(0x4e413c01, 0xed1302e4)}, + {TOBN(0x6d999127, 0xe17e44ce), TOBN(0xee86bf75, 0x33b3adfb), + TOBN(0xf6902fe6, 0x25aa96ca), TOBN(0xb73540e4, 0xe5aae47d)}}, + {{TOBN(0x32801d7b, 0x1b4a158c), TOBN(0xe571c99e, 0x27e2a369), + TOBN(0x40cb76c0, 0x10d9f197), TOBN(0xc308c289, 0x3167c0ae)}, + {TOBN(0xa6ef9dd3, 0xeb7958f2), TOBN(0xa7226dfc, 0x300879b1), + TOBN(0x6cd0b362, 0x7edf0636), TOBN(0x4efbce6c, 0x7bc37eed)}}, + {{TOBN(0x75f92a05, 0x8d699021), TOBN(0x586d4c79, 0x772566e3), + TOBN(0x378ca5f1, 0x761ad23a), TOBN(0x650d86fc, 0x1465a8ac)}, + {TOBN(0x7a4ed457, 0x842ba251), TOBN(0x6b65e3e6, 0x42234933), + TOBN(0xaf1543b7, 0x31aad657), TOBN(0xa4cefe98, 0xcbfec369)}}, + {{TOBN(0xb587da90, 0x9f47befb), TOBN(0x6562e9fb, 0x41312d13), + TOBN(0xa691ea59, 0xeff1cefe), TOBN(0xcc30477a, 0x05fc4cf6)}, + {TOBN(0xa1632461, 0x0b0ffd3d), TOBN(0xa1f16f3b, 0x5b355956), + TOBN(0x5b148d53, 0x4224ec24), TOBN(0xdc834e7b, 0xf977012a)}}, + {{TOBN(0x7bfc5e75, 0xb2c69dbc), TOBN(0x3aa77a29, 0x03c3da6c), + TOBN(0xde0df03c, 0xca910271), TOBN(0xcbd5ca4a, 0x7806dc55)}, + {TOBN(0xe1ca5807, 0x6db476cb), TOBN(0xfde15d62, 0x5f37a31e), + TOBN(0xf49af520, 0xf41af416), TOBN(0x96c5c5b1, 0x7d342db5)}}, + {{TOBN(0x155c43b7, 0xeb4ceb9b), 
TOBN(0x2e993010, 0x4e77371a), + TOBN(0x1d2987da, 0x675d43af), TOBN(0xef2bc1c0, 0x8599fd72)}, + {TOBN(0x96894b7b, 0x9342f6b2), TOBN(0x201eadf2, 0x7c8e71f0), + TOBN(0xf3479d9f, 0x4a1f3efc), TOBN(0xe0f8a742, 0x702a9704)}}, + {{TOBN(0xeafd44b6, 0xb3eba40c), TOBN(0xf9739f29, 0xc1c1e0d0), + TOBN(0x0091471a, 0x619d505e), TOBN(0xc15f9c96, 0x9d7c263e)}, + {TOBN(0x5be47285, 0x83afbe33), TOBN(0xa3b6d6af, 0x04f1e092), + TOBN(0xe76526b9, 0x751a9d11), TOBN(0x2ec5b26d, 0x9a4ae4d2)}}, + {{TOBN(0xeb66f4d9, 0x02f6fb8d), TOBN(0x4063c561, 0x96912164), + TOBN(0xeb7050c1, 0x80ef3000), TOBN(0x288d1c33, 0xeaa5b3f0)}, + {TOBN(0xe87c68d6, 0x07806fd8), TOBN(0xb2f7f9d5, 0x4bbbf50f), + TOBN(0x25972f3a, 0xac8d6627), TOBN(0xf8547774, 0x10e8c13b)}}, + {{TOBN(0xcc50ef6c, 0x872b4a60), TOBN(0xab2a34a4, 0x4613521b), + TOBN(0x39c5c190, 0x983e15d1), TOBN(0x61dde5df, 0x59905512)}, + {TOBN(0xe417f621, 0x9f2275f3), TOBN(0x0750c8b6, 0x451d894b), + TOBN(0x75b04ab9, 0x78b0bdaa), TOBN(0x3bfd9fd4, 0x458589bd)}}, + {{TOBN(0xf1013e30, 0xee9120b6), TOBN(0x2b51af93, 0x23a4743e), + TOBN(0xea96ffae, 0x48d14d9e), TOBN(0x71dc0dbe, 0x698a1d32)}, + {TOBN(0x914962d2, 0x0180cca4), TOBN(0x1ae60677, 0xc3568963), + TOBN(0x8cf227b1, 0x437bc444), TOBN(0xc650c83b, 0xc9962c7a)}}, + {{TOBN(0x23c2c7dd, 0xfe7ccfc4), TOBN(0xf925c89d, 0x1b929d48), + TOBN(0x4460f74b, 0x06783c33), TOBN(0xac2c8d49, 0xa590475a)}, + {TOBN(0xfb40b407, 0xb807bba0), TOBN(0x9d1e362d, 0x69ff8f3a), + TOBN(0xa33e9681, 0xcbef64a4), TOBN(0x67ece5fa, 0x332fb4b2)}}, + {{TOBN(0x6900a99b, 0x739f10e3), TOBN(0xc3341ca9, 0xff525925), + TOBN(0xee18a626, 0xa9e2d041), TOBN(0xa5a83685, 0x29580ddd)}, + {TOBN(0xf3470c81, 0x9d7de3cd), TOBN(0xedf02586, 0x2062cf9c), + TOBN(0xf43522fa, 0xc010edb0), TOBN(0x30314135, 0x13a4b1ae)}}, + {{TOBN(0xc792e02a, 0xdb22b94b), TOBN(0x993d8ae9, 0xa1eaa45b), + TOBN(0x8aad6cd3, 0xcd1e1c63), TOBN(0x89529ca7, 0xc5ce688a)}, + {TOBN(0x2ccee3aa, 0xe572a253), TOBN(0xe02b6438, 0x02a21efb), + TOBN(0xa7091b6e, 0xc9430358), TOBN(0x06d1b1fa, 0x9d7db504)}}, + {{TOBN(0x58846d32, 0xc4744733), TOBN(0x40517c71, 0x379f9e34), + TOBN(0x2f65655f, 0x130ef6ca), TOBN(0x526e4488, 0xf1f3503f)}, + {TOBN(0x8467bd17, 0x7ee4a976), TOBN(0x1d9dc913, 0x921363d1), + TOBN(0xd8d24c33, 0xb069e041), TOBN(0x5eb5da0a, 0x2cdf7f51)}}, + {{TOBN(0x1c0f3cb1, 0x197b994f), TOBN(0x3c95a6c5, 0x2843eae9), + TOBN(0x7766ffc9, 0xa6097ea5), TOBN(0x7bea4093, 0xd723b867)}, + {TOBN(0xb48e1f73, 0x4db378f9), TOBN(0x70025b00, 0xe37b77ac), + TOBN(0x943dc8e7, 0xaf24ad46), TOBN(0xb98a15ac, 0x16d00a85)}}, + {{TOBN(0x3adc38ba, 0x2743b004), TOBN(0xb1c7f4f7, 0x334415ee), + TOBN(0xea43df8f, 0x1e62d05a), TOBN(0x32618905, 0x9d76a3b6)}, + {TOBN(0x2fbd0bb5, 0xa23a0f46), TOBN(0x5bc971db, 0x6a01918c), + TOBN(0x7801d94a, 0xb4743f94), TOBN(0xb94df65e, 0x676ae22b)}}, + {{TOBN(0xaafcbfab, 0xaf95894c), TOBN(0x7b9bdc07, 0x276b2241), + TOBN(0xeaf98362, 0x5bdda48b), TOBN(0x5977faf2, 0xa3fcb4df)}, + {TOBN(0xbed042ef, 0x052c4b5b), TOBN(0x9fe87f71, 0x067591f0), + TOBN(0xc89c73ca, 0x22f24ec7), TOBN(0x7d37fa9e, 0xe64a9f1b)}}, + {{TOBN(0x2710841a, 0x15562627), TOBN(0x2c01a613, 0xc243b034), + TOBN(0x1d135c56, 0x2bc68609), TOBN(0xc2ca1715, 0x8b03f1f6)}, + {TOBN(0xc9966c2d, 0x3eb81d82), TOBN(0xc02abf4a, 0x8f6df13e), + TOBN(0x77b34bd7, 0x8f72b43b), TOBN(0xaff6218f, 0x360c82b0)}}, + {{TOBN(0x0aa5726c, 0x8d55b9d2), TOBN(0xdc0adbe9, 0x99e9bffb), + TOBN(0x9097549c, 0xefb9e72a), TOBN(0x16755712, 0x9dfb3111)}, + {TOBN(0xdd8bf984, 0xf26847f9), TOBN(0xbcb8e387, 0xdfb30cb7), + TOBN(0xc1fd32a7, 0x5171ef9c), TOBN(0x977f3fc7, 0x389b363f)}}, + {{TOBN(0x116eaf2b, 
0xf4babda0), TOBN(0xfeab68bd, 0xf7113c8e), + TOBN(0xd1e3f064, 0xb7def526), TOBN(0x1ac30885, 0xe0b3fa02)}, + {TOBN(0x1c5a6e7b, 0x40142d9d), TOBN(0x839b5603, 0x30921c0b), + TOBN(0x48f301fa, 0x36a116a3), TOBN(0x380e1107, 0xcfd9ee6d)}}, + {{TOBN(0x7945ead8, 0x58854be1), TOBN(0x4111c12e, 0xcbd4d49d), + TOBN(0xece3b1ec, 0x3a29c2ef), TOBN(0x6356d404, 0x8d3616f5)}, + {TOBN(0x9f0d6a8f, 0x594d320e), TOBN(0x0989316d, 0xf651ccd2), + TOBN(0x6c32117a, 0x0f8fdde4), TOBN(0x9abe5cc5, 0xa26a9bbc)}}, + {{TOBN(0xcff560fb, 0x9723f671), TOBN(0x21b2a12d, 0x7f3d593c), + TOBN(0xe4cb18da, 0x24ba0696), TOBN(0x186e2220, 0xc3543384)}, + {TOBN(0x722f64e0, 0x88312c29), TOBN(0x94282a99, 0x17dc7752), + TOBN(0x62467bbf, 0x5a85ee89), TOBN(0xf435c650, 0xf10076a0)}}, + {{TOBN(0xc9ff1539, 0x43b3a50b), TOBN(0x7132130c, 0x1a53efbc), + TOBN(0x31bfe063, 0xf7b0c5b7), TOBN(0xb0179a7d, 0x4ea994cc)}, + {TOBN(0x12d064b3, 0xc85f455b), TOBN(0x47259328, 0x8f6e0062), + TOBN(0xf64e590b, 0xb875d6d9), TOBN(0x22dd6225, 0xad92bcc7)}}, + {{TOBN(0xb658038e, 0xb9c3bd6d), TOBN(0x00cdb0d6, 0xfbba27c8), + TOBN(0x0c681337, 0x1062c45d), TOBN(0xd8515b8c, 0x2d33407d)}, + {TOBN(0xcb8f699e, 0x8cbb5ecf), TOBN(0x8c4347f8, 0xc608d7d8), + TOBN(0x2c11850a, 0xbb3e00db), TOBN(0x20a8dafd, 0xecb49d19)}}, + {{TOBN(0xbd781480, 0x45ee2f40), TOBN(0x75e354af, 0x416b60cf), + TOBN(0xde0b58a1, 0x8d49a8c4), TOBN(0xe40e94e2, 0xfa359536)}, + {TOBN(0xbd4fa59f, 0x62accd76), TOBN(0x05cf466a, 0x8c762837), + TOBN(0xb5abda99, 0x448c277b), TOBN(0x5a9e01bf, 0x48b13740)}}, + {{TOBN(0x9d457798, 0x326aad8d), TOBN(0xbdef4954, 0xc396f7e7), + TOBN(0x6fb274a2, 0xc253e292), TOBN(0x2800bf0a, 0x1cfe53e7)}, + {TOBN(0x22426d31, 0x44438fd4), TOBN(0xef233923, 0x5e259f9a), + TOBN(0x4188503c, 0x03f66264), TOBN(0x9e5e7f13, 0x7f9fdfab)}}, + {{TOBN(0x565eb76c, 0x5fcc1aba), TOBN(0xea632548, 0x59b5bff8), + TOBN(0x5587c087, 0xaab6d3fa), TOBN(0x92b639ea, 0x6ce39c1b)}, + {TOBN(0x0706e782, 0x953b135c), TOBN(0x7308912e, 0x425268ef), + TOBN(0x599e92c7, 0x090e7469), TOBN(0x83b90f52, 0x9bc35e75)}}, + {{TOBN(0x4750b3d0, 0x244975b3), TOBN(0xf3a44358, 0x11965d72), + TOBN(0x179c6774, 0x9c8dc751), TOBN(0xff18cdfe, 0xd23d9ff0)}, + {TOBN(0xc4013833, 0x2028e247), TOBN(0x96e280e2, 0xf3bfbc79), + TOBN(0xf60417bd, 0xd0880a84), TOBN(0x263c9f3d, 0x2a568151)}}, + {{TOBN(0x36be15b3, 0x2d2ce811), TOBN(0x846dc0c2, 0xf8291d21), + TOBN(0x5cfa0ecb, 0x789fcfdb), TOBN(0x45a0beed, 0xd7535b9a)}, + {TOBN(0xec8e9f07, 0x96d69af1), TOBN(0x31a7c5b8, 0x599ab6dc), + TOBN(0xd36d45ef, 0xf9e2e09f), TOBN(0x3cf49ef1, 0xdcee954b)}}, + {{TOBN(0x6be34cf3, 0x086cff9b), TOBN(0x88dbd491, 0x39a3360f), + TOBN(0x1e96b8cc, 0x0dbfbd1d), TOBN(0xc1e5f7bf, 0xcb7e2552)}, + {TOBN(0x0547b214, 0x28819d98), TOBN(0xc770dd9c, 0x7aea9dcb), + TOBN(0xaef0d4c7, 0x041d68c8), TOBN(0xcc2b9818, 0x13cb9ba8)}}, + {{TOBN(0x7fc7bc76, 0xfe86c607), TOBN(0x6b7b9337, 0x502a9a95), + TOBN(0x1948dc27, 0xd14dab63), TOBN(0x249dd198, 0xdae047be)}, + {TOBN(0xe8356584, 0xa981a202), TOBN(0x3531dd18, 0x3a893387), + TOBN(0x1be11f90, 0xc85c7209), TOBN(0x93d2fe1e, 0xe2a52b5a)}}, + {{TOBN(0x8225bfe2, 0xec6d6b97), TOBN(0x9cf6d6f4, 0xbd0aa5de), + TOBN(0x911459cb, 0x54779f5f), TOBN(0x5649cddb, 0x86aeb1f3)}, + {TOBN(0x32133579, 0x3f26ce5a), TOBN(0xc289a102, 0x550f431e), + TOBN(0x559dcfda, 0x73b84c6f), TOBN(0x84973819, 0xee3ac4d7)}}, + {{TOBN(0xb51e55e6, 0xf2606a82), TOBN(0xe25f7061, 0x90f2fb57), + TOBN(0xacef6c2a, 0xb1a4e37c), TOBN(0x864e359d, 0x5dcf2706)}, + {TOBN(0x479e6b18, 0x7ce57316), TOBN(0x2cab2500, 0x3a96b23d), + TOBN(0xed489862, 0x8ef16df7), TOBN(0x2056538c, 0xef3758b5)}}, + 
{{TOBN(0xa7df865e, 0xf15d3101), TOBN(0x80c5533a, 0x61b553d7), + TOBN(0x366e1997, 0x4ed14294), TOBN(0x6620741f, 0xb3c0bcd6)}, + {TOBN(0x21d1d9c4, 0xedc45418), TOBN(0x005b859e, 0xc1cc4a9d), + TOBN(0xdf01f630, 0xa1c462f0), TOBN(0x15d06cf3, 0xf26820c7)}}, + {{TOBN(0x9f7f24ee, 0x3484be47), TOBN(0x2ff33e96, 0x4a0c902f), + TOBN(0x00bdf457, 0x5a0bc453), TOBN(0x2378dfaf, 0x1aa238db)}, + {TOBN(0x272420ec, 0x856720f2), TOBN(0x2ad9d95b, 0x96797291), + TOBN(0xd1242cc6, 0x768a1558), TOBN(0x2e287f8b, 0x5cc86aa8)}}, + {{TOBN(0x796873d0, 0x990cecaa), TOBN(0xade55f81, 0x675d4080), + TOBN(0x2645eea3, 0x21f0cd84), TOBN(0x7a1efa0f, 0xb4e17d02)}, + {TOBN(0xf6858420, 0x037cc061), TOBN(0x682e05f0, 0xd5d43e12), + TOBN(0x59c36994, 0x27218710), TOBN(0x85cbba4d, 0x3f7cd2fc)}}, + {{TOBN(0x726f9729, 0x7a3cd22a), TOBN(0x9f8cd5dc, 0x4a628397), + TOBN(0x17b93ab9, 0xc23165ed), TOBN(0xff5f5dbf, 0x122823d4)}, + {TOBN(0xc1e4e4b5, 0x654a446d), TOBN(0xd1a9496f, 0x677257ba), + TOBN(0x6387ba94, 0xde766a56), TOBN(0x23608bc8, 0x521ec74a)}}, + {{TOBN(0x16a522d7, 0x6688c4d4), TOBN(0x9d6b4282, 0x07373abd), + TOBN(0xa62f07ac, 0xb42efaa3), TOBN(0xf73e00f7, 0xe3b90180)}, + {TOBN(0x36175fec, 0x49421c3e), TOBN(0xc4e44f9b, 0x3dcf2678), + TOBN(0x76df436b, 0x7220f09f), TOBN(0x172755fb, 0x3aa8b6cf)}}, + {{TOBN(0xbab89d57, 0x446139cc), TOBN(0x0a0a6e02, 0x5fe0208f), + TOBN(0xcdbb63e2, 0x11e5d399), TOBN(0x33ecaa12, 0xa8977f0b)}, + {TOBN(0x59598b21, 0xf7c42664), TOBN(0xb3e91b32, 0xab65d08a), + TOBN(0x035822ee, 0xf4502526), TOBN(0x1dcf0176, 0x720a82a9)}}, + {{TOBN(0x50f8598f, 0x3d589e02), TOBN(0xdf0478ff, 0xb1d63d2c), + TOBN(0x8b8068bd, 0x1571cd07), TOBN(0x30c3aa4f, 0xd79670cd)}, + {TOBN(0x25e8fd4b, 0x941ade7f), TOBN(0x3d1debdc, 0x32790011), + TOBN(0x65b6dcbd, 0x3a3f9ff0), TOBN(0x282736a4, 0x793de69c)}}, + {{TOBN(0xef69a0c3, 0xd41d3bd3), TOBN(0xb533b8c9, 0x07a26bde), + TOBN(0xe2801d97, 0xdb2edf9f), TOBN(0xdc4a8269, 0xe1877af0)}, + {TOBN(0x6c1c5851, 0x3d590dbe), TOBN(0x84632f6b, 0xee4e9357), + TOBN(0xd36d36b7, 0x79b33374), TOBN(0xb46833e3, 0x9bbca2e6)}}, + {{TOBN(0x37893913, 0xf7fc0586), TOBN(0x385315f7, 0x66bf4719), + TOBN(0x72c56293, 0xb31855dc), TOBN(0xd1416d4e, 0x849061fe)}, + {TOBN(0xbeb3ab78, 0x51047213), TOBN(0x447f6e61, 0xf040c996), + TOBN(0xd06d310d, 0x638b1d0c), TOBN(0xe28a413f, 0xbad1522e)}}, + {{TOBN(0x685a76cb, 0x82003f86), TOBN(0x610d07f7, 0x0bcdbca3), + TOBN(0x6ff66021, 0x9ca4c455), TOBN(0x7df39b87, 0xcea10eec)}, + {TOBN(0xb9255f96, 0xe22db218), TOBN(0x8cc6d9eb, 0x08a34c44), + TOBN(0xcd4ffb86, 0x859f9276), TOBN(0x8fa15eb2, 0x50d07335)}}, + {{TOBN(0xdf553845, 0xcf2c24b5), TOBN(0x89f66a9f, 0x52f9c3ba), + TOBN(0x8f22b5b9, 0xe4a7ceb3), TOBN(0xaffef809, 0x0e134686)}, + {TOBN(0x3e53e1c6, 0x8eb8fac2), TOBN(0x93c1e4eb, 0x28aec98e), + TOBN(0xb6b91ec5, 0x32a43bcb), TOBN(0x2dbfa947, 0xb2d74a51)}}, + {{TOBN(0xe065d190, 0xca84bad7), TOBN(0xfb13919f, 0xad58e65c), + TOBN(0x3c41718b, 0xf1cb6e31), TOBN(0x688969f0, 0x06d05c3f)}, + {TOBN(0xd4f94ce7, 0x21264d45), TOBN(0xfdfb65e9, 0x7367532b), + TOBN(0x5b1be8b1, 0x0945a39d), TOBN(0x229f789c, 0x2b8baf3b)}}, + {{TOBN(0xd8f41f3e, 0x6f49f15d), TOBN(0x678ce828, 0x907f0792), + TOBN(0xc69ace82, 0xfca6e867), TOBN(0x106451ae, 0xd01dcc89)}, + {TOBN(0x1bb4f7f0, 0x19fc32d2), TOBN(0x64633dfc, 0xb00c52d2), + TOBN(0x8f13549a, 0xad9ea445), TOBN(0x99a3bf50, 0xfb323705)}}, + {{TOBN(0x0c9625a2, 0x534d4dbc), TOBN(0x45b8f1d1, 0xc2a2fea3), + TOBN(0x76ec21a1, 0xa530fc1a), TOBN(0x4bac9c2a, 0x9e5bd734)}, + {TOBN(0x5996d76a, 0x7b4e3587), TOBN(0x0045cdee, 0x1182d9e3), + TOBN(0x1aee24b9, 0x1207f13d), TOBN(0x66452e97, 0x97345a41)}}, 
+ {{TOBN(0x16e5b054, 0x9f950cd0), TOBN(0x9cc72fb1, 0xd7fdd075), + TOBN(0x6edd61e7, 0x66249663), TOBN(0xde4caa4d, 0xf043cccb)}, + {TOBN(0x11b1f57a, 0x55c7ac17), TOBN(0x779cbd44, 0x1a85e24d), + TOBN(0x78030f86, 0xe46081e7), TOBN(0xfd4a6032, 0x8e20f643)}}, + {{TOBN(0xcc7a6488, 0x0a750c0f), TOBN(0x39bacfe3, 0x4e548e83), + TOBN(0x3d418c76, 0x0c110f05), TOBN(0x3e4daa4c, 0xb1f11588)}, + {TOBN(0x2733e7b5, 0x5ffc69ff), TOBN(0x46f147bc, 0x92053127), + TOBN(0x885b2434, 0xd722df94), TOBN(0x6a444f65, 0xe6fc6b7c)}}}, + {{{TOBN(0x7a1a465a, 0xc3f16ea8), TOBN(0x115a461d, 0xb2f1d11c), + TOBN(0x4767dd95, 0x6c68a172), TOBN(0x3392f2eb, 0xd13a4698)}, + {TOBN(0xc7a99ccd, 0xe526cdc7), TOBN(0x8e537fdc, 0x22292b81), + TOBN(0x76d8cf69, 0xa6d39198), TOBN(0xffc5ff43, 0x2446852d)}}, + {{TOBN(0x97b14f7e, 0xa90567e6), TOBN(0x513257b7, 0xb6ae5cb7), + TOBN(0x85454a3c, 0x9f10903d), TOBN(0xd8d2c9ad, 0x69bc3724)}, + {TOBN(0x38da9324, 0x6b29cb44), TOBN(0xb540a21d, 0x77c8cbac), + TOBN(0x9bbfe435, 0x01918e42), TOBN(0xfffa707a, 0x56c3614e)}}, + {{TOBN(0x0ce4e3f1, 0xd4e353b7), TOBN(0x062d8a14, 0xef46b0a0), + TOBN(0x6408d5ab, 0x574b73fd), TOBN(0xbc41d1c9, 0xd3273ffd)}, + {TOBN(0x3538e1e7, 0x6be77800), TOBN(0x71fe8b37, 0xc5655031), + TOBN(0x1cd91621, 0x6b9b331a), TOBN(0xad825d0b, 0xbb388f73)}}, + {{TOBN(0x56c2e05b, 0x1cb76219), TOBN(0x0ec0bf91, 0x71567e7e), + TOBN(0xe7076f86, 0x61c4c910), TOBN(0xd67b085b, 0xbabc04d9)}, + {TOBN(0x9fb90459, 0x5e93a96a), TOBN(0x7526c1ea, 0xfbdc249a), + TOBN(0x0d44d367, 0xecdd0bb7), TOBN(0x95399917, 0x9dc0d695)}}, + {{TOBN(0x61360ee9, 0x9e240d18), TOBN(0x057cdcac, 0xb4b94466), + TOBN(0xe7667cd1, 0x2fe5325c), TOBN(0x1fa297b5, 0x21974e3b)}, + {TOBN(0xfa4081e7, 0xdb083d76), TOBN(0x31993be6, 0xf206bd15), + TOBN(0x8949269b, 0x14c19f8c), TOBN(0x21468d72, 0xa9d92357)}}, + {{TOBN(0x2ccbc583, 0xa4c506ec), TOBN(0x957ed188, 0xd1acfe97), + TOBN(0x8baed833, 0x12f1aea2), TOBN(0xef2a6cb4, 0x8325362d)}, + {TOBN(0x130dde42, 0x8e195c43), TOBN(0xc842025a, 0x0e6050c6), + TOBN(0x2da972a7, 0x08686a5d), TOBN(0xb52999a1, 0xe508b4a8)}}, + {{TOBN(0xd9f090b9, 0x10a5a8bd), TOBN(0xca91d249, 0x096864da), + TOBN(0x8e6a93be, 0x3f67dbc1), TOBN(0xacae6fba, 0xf5f4764c)}, + {TOBN(0x1563c6e0, 0xd21411a0), TOBN(0x28fa787f, 0xda0a4ad8), + TOBN(0xd524491c, 0x908c8030), TOBN(0x1257ba0e, 0x4c795f07)}}, + {{TOBN(0x83f49167, 0xceca9754), TOBN(0x426d2cf6, 0x4b7939a0), + TOBN(0x2555e355, 0x723fd0bf), TOBN(0xa96e6d06, 0xc4f144e2)}, + {TOBN(0x4768a8dd, 0x87880e61), TOBN(0x15543815, 0xe508e4d5), + TOBN(0x09d7e772, 0xb1b65e15), TOBN(0x63439dd6, 0xac302fa0)}}, + {{TOBN(0xb93f802f, 0xc14e35c2), TOBN(0x71735b7c, 0x4341333c), + TOBN(0x03a25104, 0x16d4f362), TOBN(0x3f4d069b, 0xbf433c8e)}, + {TOBN(0x0d83ae01, 0xf78f5a7c), TOBN(0x50a8ffbe, 0x7c4eed07), + TOBN(0xc74f8906, 0x76e10f83), TOBN(0x7d080966, 0x9ddaf8e1)}}, + {{TOBN(0xb11df8e1, 0x698e04cc), TOBN(0x877be203, 0x169005c8), + TOBN(0x32749e8c, 0x4f3c6179), TOBN(0x2dbc9d0a, 0x7853fc05)}, + {TOBN(0x187d4f93, 0x9454d937), TOBN(0xe682ce9d, 0xb4800e1b), + TOBN(0xa9129ad8, 0x165e68e8), TOBN(0x0fe29735, 0xbe7f785b)}}, + {{TOBN(0x5303f40c, 0x5b9e02b7), TOBN(0xa37c9692, 0x35ee04e8), + TOBN(0x5f46cc20, 0x34d6632b), TOBN(0x55ef72b2, 0x96ac545b)}, + {TOBN(0xabec5c1f, 0x7b91b062), TOBN(0x0a79e1c7, 0xbb33e821), + TOBN(0xbb04b428, 0x3a9f4117), TOBN(0x0de1f28f, 0xfd2a475a)}}, + {{TOBN(0x31019ccf, 0x3a4434b4), TOBN(0xa3458111, 0x1a7954dc), + TOBN(0xa9dac80d, 0xe34972a7), TOBN(0xb043d054, 0x74f6b8dd)}, + {TOBN(0x021c319e, 0x11137b1a), TOBN(0x00a754ce, 0xed5cc03f), + TOBN(0x0aa2c794, 0xcbea5ad4), TOBN(0x093e67f4, 
0x70c015b6)}}, + {{TOBN(0x72cdfee9, 0xc97e3f6b), TOBN(0xc10bcab4, 0xb6da7461), + TOBN(0x3b02d2fc, 0xb59806b9), TOBN(0x85185e89, 0xa1de6f47)}, + {TOBN(0x39e6931f, 0x0eb6c4d4), TOBN(0x4d4440bd, 0xd4fa5b04), + TOBN(0x5418786e, 0x34be7eb8), TOBN(0x6380e521, 0x9d7259bc)}}, + {{TOBN(0x20ac0351, 0xd598d710), TOBN(0x272c4166, 0xcb3a4da4), + TOBN(0xdb82fe1a, 0xca71de1f), TOBN(0x746e79f2, 0xd8f54b0f)}, + {TOBN(0x6e7fc736, 0x4b573e9b), TOBN(0x75d03f46, 0xfd4b5040), + TOBN(0x5c1cc36d, 0x0b98d87b), TOBN(0x513ba3f1, 0x1f472da1)}}, + {{TOBN(0x79d0af26, 0xabb177dd), TOBN(0xf82ab568, 0x7891d564), + TOBN(0x2b6768a9, 0x72232173), TOBN(0xefbb3bb0, 0x8c1f6619)}, + {TOBN(0xb29c11db, 0xa6d18358), TOBN(0x519e2797, 0xb0916d3a), + TOBN(0xd4dc18f0, 0x9188e290), TOBN(0x648e86e3, 0x98b0ca7f)}}, + {{TOBN(0x859d3145, 0x983c38b5), TOBN(0xb14f176c, 0x637abc8b), + TOBN(0x2793fb9d, 0xcaff7be6), TOBN(0xebe5a55f, 0x35a66a5a)}, + {TOBN(0x7cec1dcd, 0x9f87dc59), TOBN(0x7c595cd3, 0xfbdbf560), + TOBN(0x5b543b22, 0x26eb3257), TOBN(0x69080646, 0xc4c935fd)}}, + {{TOBN(0x7f2e4403, 0x81e9ede3), TOBN(0x243c3894, 0xcaf6df0a), + TOBN(0x7c605bb1, 0x1c073b11), TOBN(0xcd06a541, 0xba6a4a62)}, + {TOBN(0x29168949, 0x49d4e2e5), TOBN(0x33649d07, 0x4af66880), + TOBN(0xbfc0c885, 0xe9a85035), TOBN(0xb4e52113, 0xfc410f4b)}}, + {{TOBN(0xdca3b706, 0x78a6513b), TOBN(0x92ea4a2a, 0x9edb1943), + TOBN(0x02642216, 0xdb6e2dd8), TOBN(0x9b45d0b4, 0x9fd57894)}, + {TOBN(0x114e70db, 0xc69d11ae), TOBN(0x1477dd19, 0x4c57595f), + TOBN(0xbc2208b4, 0xec77c272), TOBN(0x95c5b4d7, 0xdb68f59c)}}, + {{TOBN(0xb8c4fc63, 0x42e532b7), TOBN(0x386ba422, 0x9ae35290), + TOBN(0xfb5dda42, 0xd201ecbc), TOBN(0x2353dc8b, 0xa0e38fd6)}, + {TOBN(0x9a0b85ea, 0x68f7e978), TOBN(0x96ec5682, 0x2ad6d11f), + TOBN(0x5e279d6c, 0xe5f6886d), TOBN(0xd3fe03cd, 0x3cb1914d)}}, + {{TOBN(0xfe541fa4, 0x7ea67c77), TOBN(0x952bd2af, 0xe3ea810c), + TOBN(0x791fef56, 0x8d01d374), TOBN(0xa3a1c621, 0x0f11336e)}, + {TOBN(0x5ad0d5a9, 0xc7ec6d79), TOBN(0xff7038af, 0x3225c342), + TOBN(0x003c6689, 0xbc69601b), TOBN(0x25059bc7, 0x45e8747d)}}, + {{TOBN(0xfa4965b2, 0xf2086fbf), TOBN(0xf6840ea6, 0x86916078), + TOBN(0xd7ac7620, 0x70081d6c), TOBN(0xe600da31, 0xb5328645)}, + {TOBN(0x01916f63, 0x529b8a80), TOBN(0xe80e4858, 0x2d7d6f3e), + TOBN(0x29eb0fe8, 0xd664ca7c), TOBN(0xf017637b, 0xe7b43b0c)}}, + {{TOBN(0x9a75c806, 0x76cb2566), TOBN(0x8f76acb1, 0xb24892d9), + TOBN(0x7ae7b9cc, 0x1f08fe45), TOBN(0x19ef7329, 0x6a4907d8)}, + {TOBN(0x2db4ab71, 0x5f228bf0), TOBN(0xf3cdea39, 0x817032d7), + TOBN(0x0b1f482e, 0xdcabe3c0), TOBN(0x3baf76b4, 0xbb86325c)}}, + {{TOBN(0xd49065e0, 0x10089465), TOBN(0x3bab5d29, 0x8e77c596), + TOBN(0x7636c3a6, 0x193dbd95), TOBN(0xdef5d294, 0xb246e499)}, + {TOBN(0xb22c58b9, 0x286b2475), TOBN(0xa0b93939, 0xcd80862b), + TOBN(0x3002c83a, 0xf0992388), TOBN(0x6de01f9b, 0xeacbe14c)}}, + {{TOBN(0x6aac688e, 0xadd70482), TOBN(0x708de92a, 0x7b4a4e8a), + TOBN(0x75b6dd73, 0x758a6eef), TOBN(0xea4bf352, 0x725b3c43)}, + {TOBN(0x10041f2c, 0x87912868), TOBN(0xb1b1be95, 0xef09297a), + TOBN(0x19ae23c5, 0xa9f3860a), TOBN(0xc4f0f839, 0x515dcf4b)}}, + {{TOBN(0x3c7ecca3, 0x97f6306a), TOBN(0x744c44ae, 0x68a3a4b0), + TOBN(0x69cd13a0, 0xb3a1d8a2), TOBN(0x7cad0a1e, 0x5256b578)}, + {TOBN(0xea653fcd, 0x33791d9e), TOBN(0x9cc2a05d, 0x74b2e05f), + TOBN(0x73b391dc, 0xfd7affa2), TOBN(0xddb7091e, 0xb6b05442)}}, + {{TOBN(0xc71e27bf, 0x8538a5c6), TOBN(0x195c63dd, 0x89abff17), + TOBN(0xfd315285, 0x1b71e3da), TOBN(0x9cbdfda7, 0xfa680fa0)}, + {TOBN(0x9db876ca, 0x849d7eab), TOBN(0xebe2764b, 0x3c273271), + TOBN(0x663357e3, 0xf208dcea), 
TOBN(0x8c5bd833, 0x565b1b70)}}, + {{TOBN(0xccc3b4f5, 0x9837fc0d), TOBN(0x9b641ba8, 0xa79cf00f), + TOBN(0x7428243d, 0xdfdf3990), TOBN(0x83a594c4, 0x020786b1)}, + {TOBN(0xb712451a, 0x526c4502), TOBN(0x9d39438e, 0x6adb3f93), + TOBN(0xfdb261e3, 0xe9ff0ccd), TOBN(0x80344e3c, 0xe07af4c3)}}, + {{TOBN(0x75900d7c, 0x2fa4f126), TOBN(0x08a3b865, 0x5c99a232), + TOBN(0x2478b6bf, 0xdb25e0c3), TOBN(0x482cc2c2, 0x71db2edf)}, + {TOBN(0x37df7e64, 0x5f321bb8), TOBN(0x8a93821b, 0x9a8005b4), + TOBN(0x3fa2f10c, 0xcc8c1958), TOBN(0x0d332218, 0x2c269d0a)}}, + {{TOBN(0x20ab8119, 0xe246b0e6), TOBN(0xb39781e4, 0xd349fd17), + TOBN(0xd293231e, 0xb31aa100), TOBN(0x4b779c97, 0xbb032168)}, + {TOBN(0x4b3f19e1, 0xc8470500), TOBN(0x45b7efe9, 0x0c4c869d), + TOBN(0xdb84f38a, 0xa1a6bbcc), TOBN(0x3b59cb15, 0xb2fddbc1)}}, + {{TOBN(0xba5514df, 0x3fd165e8), TOBN(0x499fd6a9, 0x061f8811), + TOBN(0x72cd1fe0, 0xbfef9f00), TOBN(0x120a4bb9, 0x79ad7e8a)}, + {TOBN(0xf2ffd095, 0x5f4a5ac5), TOBN(0xcfd174f1, 0x95a7a2f0), + TOBN(0xd42301ba, 0x9d17baf1), TOBN(0xd2fa487a, 0x77f22089)}}, + {{TOBN(0x9cb09efe, 0xb1dc77e1), TOBN(0xe9566939, 0x21c99682), + TOBN(0x8c546901, 0x6c6067bb), TOBN(0xfd378574, 0x61c24456)}, + {TOBN(0x2b6a6cbe, 0x81796b33), TOBN(0x62d550f6, 0x58e87f8b), + TOBN(0x1b763e1c, 0x7f1b01b4), TOBN(0x4b93cfea, 0x1b1b5e12)}}, + {{TOBN(0xb9345238, 0x1d531696), TOBN(0x57201c00, 0x88cdde69), + TOBN(0xdde92251, 0x9a86afc7), TOBN(0xe3043895, 0xbd35cea8)}, + {TOBN(0x7608c1e1, 0x8555970d), TOBN(0x8267dfa9, 0x2535935e), + TOBN(0xd4c60a57, 0x322ea38b), TOBN(0xe0bf7977, 0x804ef8b5)}}, + {{TOBN(0x1a0dab28, 0xc06fece4), TOBN(0xd405991e, 0x94e7b49d), + TOBN(0xc542b6d2, 0x706dab28), TOBN(0xcb228da3, 0xa91618fb)}, + {TOBN(0x224e4164, 0x107d1cea), TOBN(0xeb9fdab3, 0xd0f5d8f1), + TOBN(0xc02ba386, 0x0d6e41cd), TOBN(0x676a72c5, 0x9b1f7146)}}, + {{TOBN(0xffd6dd98, 0x4d6cb00b), TOBN(0xcef9c5ca, 0xde2e8d7c), + TOBN(0xa1bbf5d7, 0x641c7936), TOBN(0x1b95b230, 0xee8f772e)}, + {TOBN(0xf765a92e, 0xe8ac25b1), TOBN(0xceb04cfc, 0x3a18b7c6), + TOBN(0x27944cef, 0x0acc8966), TOBN(0xcbb3c957, 0x434c1004)}}, + {{TOBN(0x9c9971a1, 0xa43ff93c), TOBN(0x5bc2db17, 0xa1e358a9), + TOBN(0x45b4862e, 0xa8d9bc82), TOBN(0x70ebfbfb, 0x2201e052)}, + {TOBN(0xafdf64c7, 0x92871591), TOBN(0xea5bcae6, 0xb42d0219), + TOBN(0xde536c55, 0x2ad8f03c), TOBN(0xcd6c3f4d, 0xa76aa33c)}}, + {{TOBN(0xbeb5f623, 0x0bca6de3), TOBN(0xdd20dd99, 0xb1e706fd), + TOBN(0x90b3ff9d, 0xac9059d4), TOBN(0x2d7b2902, 0x7ccccc4e)}, + {TOBN(0x8a090a59, 0xce98840f), TOBN(0xa5d947e0, 0x8410680a), + TOBN(0x49ae346a, 0x923379a5), TOBN(0x7dbc84f9, 0xb28a3156)}}, + {{TOBN(0xfd40d916, 0x54a1aff2), TOBN(0xabf318ba, 0x3a78fb9b), + TOBN(0x50152ed8, 0x3029f95e), TOBN(0x9fc1dd77, 0xc58ad7fa)}, + {TOBN(0x5fa57915, 0x13595c17), TOBN(0xb9504668, 0x8f62b3a9), + TOBN(0x907b5b24, 0xff3055b0), TOBN(0x2e995e35, 0x9a84f125)}}, + {{TOBN(0x87dacf69, 0x7e9bbcfb), TOBN(0x95d0c1d6, 0xe86d96e3), + TOBN(0x65726e3c, 0x2d95a75c), TOBN(0x2c3c9001, 0xacd27f21)}, + {TOBN(0x1deab561, 0x6c973f57), TOBN(0x108b7e2c, 0xa5221643), + TOBN(0x5fee9859, 0xc4ef79d4), TOBN(0xbd62b88a, 0x40d4b8c6)}}, + {{TOBN(0xb4dd29c4, 0x197c75d6), TOBN(0x266a6df2, 0xb7076feb), + TOBN(0x9512d0ea, 0x4bf2df11), TOBN(0x1320c24f, 0x6b0cc9ec)}, + {TOBN(0x6bb1e0e1, 0x01a59596), TOBN(0x8317c5bb, 0xeff9aaac), + TOBN(0x65bb405e, 0x385aa6c9), TOBN(0x613439c1, 0x8f07988f)}}, + {{TOBN(0xd730049f, 0x16a66e91), TOBN(0xe97f2820, 0xfa1b0e0d), + TOBN(0x4131e003, 0x304c28ea), TOBN(0x820ab732, 0x526bac62)}, + {TOBN(0xb2ac9ef9, 0x28714423), TOBN(0x54ecfffa, 0xadb10cb2), + TOBN(0x8781476e, 
0xf886a4cc), TOBN(0x4b2c87b5, 0xdb2f8d49)}}, + {{TOBN(0xe857cd20, 0x0a44295d), TOBN(0x707d7d21, 0x58c6b044), + TOBN(0xae8521f9, 0xf596757c), TOBN(0x87448f03, 0x67b2b714)}, + {TOBN(0x13a9bc45, 0x5ebcd58d), TOBN(0x79bcced9, 0x9122d3c1), + TOBN(0x3c644247, 0x9e076642), TOBN(0x0cf22778, 0x2df4767d)}}, + {{TOBN(0x5e61aee4, 0x71d444b6), TOBN(0x211236bf, 0xc5084a1d), + TOBN(0x7e15bc9a, 0x4fd3eaf6), TOBN(0x68df2c34, 0xab622bf5)}, + {TOBN(0x9e674f0f, 0x59bf4f36), TOBN(0xf883669b, 0xd7f34d73), + TOBN(0xc48ac1b8, 0x31497b1d), TOBN(0x323b925d, 0x5106703b)}}, + {{TOBN(0x22156f42, 0x74082008), TOBN(0xeffc521a, 0xc8482bcb), + TOBN(0x5c6831bf, 0x12173479), TOBN(0xcaa2528f, 0xc4739490)}, + {TOBN(0x84d2102a, 0x8f1b3c4d), TOBN(0xcf64dfc1, 0x2d9bec0d), + TOBN(0x433febad, 0x78a546ef), TOBN(0x1f621ec3, 0x7b73cef1)}}, + {{TOBN(0x6aecd627, 0x37338615), TOBN(0x162082ab, 0x01d8edf6), + TOBN(0x833a8119, 0x19e86b66), TOBN(0x6023a251, 0xd299b5db)}, + {TOBN(0xf5bb0c3a, 0xbbf04b89), TOBN(0x6735eb69, 0xae749a44), + TOBN(0xd0e058c5, 0x4713de3b), TOBN(0xfdf2593e, 0x2c3d4ccd)}}, + {{TOBN(0x1b8f414e, 0xfdd23667), TOBN(0xdd52aaca, 0xfa2015ee), + TOBN(0x3e31b517, 0xbd9625ff), TOBN(0x5ec9322d, 0x8db5918c)}, + {TOBN(0xbc73ac85, 0xa96f5294), TOBN(0x82aa5bf3, 0x61a0666a), + TOBN(0x49755810, 0xbf08ac42), TOBN(0xd21cdfd5, 0x891cedfc)}}, + {{TOBN(0x918cb57b, 0x67f8be10), TOBN(0x365d1a7c, 0x56ffa726), + TOBN(0x2435c504, 0x6532de93), TOBN(0xc0fc5e10, 0x2674cd02)}, + {TOBN(0x6e51fcf8, 0x9cbbb142), TOBN(0x1d436e5a, 0xafc50692), + TOBN(0x766bffff, 0x3fbcae22), TOBN(0x3148c2fd, 0xfd55d3b8)}}, + {{TOBN(0x52c7fdc9, 0x233222fa), TOBN(0x89ff1092, 0xe419fb6b), + TOBN(0x3cd6db99, 0x25254977), TOBN(0x2e85a161, 0x1cf12ca7)}, + {TOBN(0xadd2547c, 0xdc810bc9), TOBN(0xea3f458f, 0x9d257c22), + TOBN(0x642c1fbe, 0x27d6b19b), TOBN(0xed07e6b5, 0x140481a6)}}, + {{TOBN(0x6ada1d42, 0x86d2e0f8), TOBN(0xe5920122, 0x0e8a9fd5), + TOBN(0x02c936af, 0x708c1b49), TOBN(0x60f30fee, 0x2b4bfaff)}, + {TOBN(0x6637ad06, 0x858e6a61), TOBN(0xce4c7767, 0x3fd374d0), + TOBN(0x39d54b2d, 0x7188defb), TOBN(0xa8c9d250, 0xf56a6b66)}}, + {{TOBN(0x58fc0f5e, 0xb24fe1dc), TOBN(0x9eaf9dee, 0x6b73f24c), + TOBN(0xa90d588b, 0x33650705), TOBN(0xde5b62c5, 0xaf2ec729)}, + {TOBN(0x5c72cfae, 0xd3c2b36e), TOBN(0x868c19d5, 0x034435da), + TOBN(0x88605f93, 0xe17ee145), TOBN(0xaa60c4ee, 0x77a5d5b1)}}, + {{TOBN(0xbcf5bfd2, 0x3b60c472), TOBN(0xaf4ef13c, 0xeb1d3049), + TOBN(0x373f44fc, 0xe13895c9), TOBN(0xf29b382f, 0x0cbc9822)}, + {TOBN(0x1bfcb853, 0x73efaef6), TOBN(0xcf56ac9c, 0xa8c96f40), + TOBN(0xd7adf109, 0x7a191e24), TOBN(0x98035f44, 0xbf8a8dc2)}}, + {{TOBN(0xf40a71b9, 0x1e750c84), TOBN(0xc57f7b0c, 0x5dc6c469), + TOBN(0x49a0e79c, 0x6fbc19c1), TOBN(0x6b0f5889, 0xa48ebdb8)}, + {TOBN(0x5d3fd084, 0xa07c4e9f), TOBN(0xc3830111, 0xab27de14), + TOBN(0x0e4929fe, 0x33e08dcc), TOBN(0xf4a5ad24, 0x40bb73a3)}}, + {{TOBN(0xde86c2bf, 0x490f97ca), TOBN(0x288f09c6, 0x67a1ce18), + TOBN(0x364bb886, 0x1844478d), TOBN(0x7840fa42, 0xceedb040)}, + {TOBN(0x1269fdd2, 0x5a631b37), TOBN(0x94761f1e, 0xa47c8b7d), + TOBN(0xfc0c2e17, 0x481c6266), TOBN(0x85e16ea2, 0x3daa5fa7)}}, + {{TOBN(0xccd86033, 0x92491048), TOBN(0x0c2f6963, 0xf4d402d7), + TOBN(0x6336f7df, 0xdf6a865c), TOBN(0x0a2a463c, 0xb5c02a87)}, + {TOBN(0xb0e29be7, 0xbf2f12ee), TOBN(0xf0a22002, 0x66bad988), + TOBN(0x27f87e03, 0x9123c1d7), TOBN(0x21669c55, 0x328a8c98)}}, + {{TOBN(0x186b9803, 0x92f14529), TOBN(0xd3d056cc, 0x63954df3), + TOBN(0x2f03fd58, 0x175a46f6), TOBN(0x63e34ebe, 0x11558558)}, + {TOBN(0xe13fedee, 0x5b80cfa5), TOBN(0xe872a120, 0xd401dbd1), + 
TOBN(0x52657616, 0xe8a9d667), TOBN(0xbc8da4b6, 0xe08d6693)}}, + {{TOBN(0x370fb9bb, 0x1b703e75), TOBN(0x6773b186, 0xd4338363), + TOBN(0x18dad378, 0xecef7bff), TOBN(0xaac787ed, 0x995677da)}, + {TOBN(0x4801ea8b, 0x0437164b), TOBN(0xf430ad20, 0x73fe795e), + TOBN(0xb164154d, 0x8ee5eb73), TOBN(0x0884ecd8, 0x108f7c0e)}}, + {{TOBN(0x0e6ec096, 0x5f520698), TOBN(0x640631fe, 0x44f7b8d9), + TOBN(0x92fd34fc, 0xa35a68b9), TOBN(0x9c5a4b66, 0x4d40cf4e)}, + {TOBN(0x949454bf, 0x80b6783d), TOBN(0x80e701fe, 0x3a320a10), + TOBN(0x8d1a564a, 0x1a0a39b2), TOBN(0x1436d53d, 0x320587db)}}, + {{TOBN(0xf5096e6d, 0x6556c362), TOBN(0xbc23a3c0, 0xe2455d7e), + TOBN(0x3a7aee54, 0x807230f9), TOBN(0x9ba1cfa6, 0x22ae82fd)}, + {TOBN(0x833a057a, 0x99c5d706), TOBN(0x8be85f4b, 0x842315c9), + TOBN(0xd083179a, 0x66a72f12), TOBN(0x2fc77d5d, 0xcdcc73cd)}}, + {{TOBN(0x22b88a80, 0x5616ee30), TOBN(0xfb09548f, 0xe7ab1083), + TOBN(0x8ad6ab0d, 0x511270cd), TOBN(0x61f6c57a, 0x6924d9ab)}, + {TOBN(0xa0f7bf72, 0x90aecb08), TOBN(0x849f87c9, 0x0df784a4), + TOBN(0x27c79c15, 0xcfaf1d03), TOBN(0xbbf9f675, 0xc463face)}}, + {{TOBN(0x91502c65, 0x765ba543), TOBN(0x18ce3cac, 0x42ea60dd), + TOBN(0xe5cee6ac, 0x6e43ecb3), TOBN(0x63e4e910, 0x68f2aeeb)}, + {TOBN(0x26234fa3, 0xc85932ee), TOBN(0x96883e8b, 0x4c90c44d), + TOBN(0x29b9e738, 0xa18a50f6), TOBN(0xbfc62b2a, 0x3f0420df)}}, + {{TOBN(0xd22a7d90, 0x6d3e1fa9), TOBN(0x17115618, 0xfe05b8a3), + TOBN(0x2a0c9926, 0xbb2b9c01), TOBN(0xc739fcc6, 0xe07e76a2)}, + {TOBN(0x540e9157, 0x165e439a), TOBN(0x06353a62, 0x6a9063d8), + TOBN(0x84d95594, 0x61e927a3), TOBN(0x013b9b26, 0xe2e0be7f)}}, + {{TOBN(0x4feaec3b, 0x973497f1), TOBN(0x15c0f94e, 0x093ebc2d), + TOBN(0x6af5f227, 0x33af0583), TOBN(0x0c2af206, 0xc61f3340)}, + {TOBN(0xd25dbdf1, 0x4457397c), TOBN(0x2e8ed017, 0xcabcbae0), + TOBN(0xe3010938, 0xc2815306), TOBN(0xbaa99337, 0xe8c6cd68)}}, + {{TOBN(0x08513182, 0x3b0ec7de), TOBN(0x1e1b822b, 0x58df05df), + TOBN(0x5c14842f, 0xa5c3b683), TOBN(0x98fe977e, 0x3eba34ce)}, + {TOBN(0xfd2316c2, 0x0d5e8873), TOBN(0xe48d839a, 0xbd0d427d), + TOBN(0x495b2218, 0x623fc961), TOBN(0x24ee56e7, 0xb46fba5e)}}, + {{TOBN(0x9184a55b, 0x91e4de58), TOBN(0xa7488ca5, 0xdfdea288), + TOBN(0xa723862e, 0xa8dcc943), TOBN(0x92d762b2, 0x849dc0fc)}, + {TOBN(0x3c444a12, 0x091ff4a9), TOBN(0x581113fa, 0x0cada274), + TOBN(0xb9de0a45, 0x30d8eae2), TOBN(0x5e0fcd85, 0xdf6b41ea)}}, + {{TOBN(0x6233ea68, 0xc094dbb5), TOBN(0xb77d062e, 0xd968d410), + TOBN(0x3e719bbc, 0x58b3002d), TOBN(0x68e7dd3d, 0x3dc49d58)}, + {TOBN(0x8d825740, 0x013a5e58), TOBN(0x21311747, 0x3c9e3c1b), + TOBN(0x0cb0a2a7, 0x7c99b6ab), TOBN(0x5c48a3b3, 0xc2f888f2)}}}, + {{{TOBN(0xc7913e91, 0x991724f3), TOBN(0x5eda799c, 0x39cbd686), + TOBN(0xddb595c7, 0x63d4fc1e), TOBN(0x6b63b80b, 0xac4fed54)}, + {TOBN(0x6ea0fc69, 0x7e5fb516), TOBN(0x737708ba, 0xd0f1c964), + TOBN(0x9628745f, 0x11a92ca5), TOBN(0x61f37958, 0x9a86967a)}}, + {{TOBN(0x9af39b2c, 0xaa665072), TOBN(0x78322fa4, 0xefd324ef), + TOBN(0x3d153394, 0xc327bd31), TOBN(0x81d5f271, 0x3129dab0)}, + {TOBN(0xc72e0c42, 0xf48027f5), TOBN(0xaa40cdbc, 0x8536e717), + TOBN(0xf45a657a, 0x2d369d0f), TOBN(0xb03bbfc4, 0xea7f74e6)}}, + {{TOBN(0x46a8c418, 0x0d738ded), TOBN(0x6f1a5bb0, 0xe0de5729), + TOBN(0xf10230b9, 0x8ba81675), TOBN(0x32c6f30c, 0x112b33d4)}, + {TOBN(0x7559129d, 0xd8fffb62), TOBN(0x6a281b47, 0xb459bf05), + TOBN(0x77c1bd3a, 0xfa3b6776), TOBN(0x0709b380, 0x7829973a)}}, + {{TOBN(0x8c26b232, 0xa3326505), TOBN(0x38d69272, 0xee1d41bf), + TOBN(0x0459453e, 0xffe32afa), TOBN(0xce8143ad, 0x7cb3ea87)}, + {TOBN(0x932ec1fa, 0x7e6ab666), TOBN(0x6cd2d230, 
0x22286264), + TOBN(0x459a46fe, 0x6736f8ed), TOBN(0x50bf0d00, 0x9eca85bb)}}, + {{TOBN(0x0b825852, 0x877a21ec), TOBN(0x300414a7, 0x0f537a94), + TOBN(0x3f1cba40, 0x21a9a6a2), TOBN(0x50824eee, 0x76943c00)}, + {TOBN(0xa0dbfcec, 0xf83cba5d), TOBN(0xf9538148, 0x93b4f3c0), + TOBN(0x61744162, 0x48f24dd7), TOBN(0x5322d64d, 0xe4fb09dd)}}, + {{TOBN(0x57447384, 0x3d9325f3), TOBN(0xa9bef2d0, 0xf371cb84), + TOBN(0x77d2188b, 0xa61e36c5), TOBN(0xbbd6a7d7, 0xc602df72)}, + {TOBN(0xba3aa902, 0x8f61bc0b), TOBN(0xf49085ed, 0x6ed0b6a1), + TOBN(0x8bc625d6, 0xae6e8298), TOBN(0x832b0b1d, 0xa2e9c01d)}}, + {{TOBN(0xa337c447, 0xf1f0ced1), TOBN(0x800cc793, 0x9492dd2b), + TOBN(0x4b93151d, 0xbea08efa), TOBN(0x820cf3f8, 0xde0a741e)}, + {TOBN(0xff1982dc, 0x1c0f7d13), TOBN(0xef921960, 0x84dde6ca), + TOBN(0x1ad7d972, 0x45f96ee3), TOBN(0x319c8dbe, 0x29dea0c7)}}, + {{TOBN(0xd3ea3871, 0x7b82b99b), TOBN(0x75922d4d, 0x470eb624), + TOBN(0x8f66ec54, 0x3b95d466), TOBN(0x66e673cc, 0xbee1e346)}, + {TOBN(0x6afe67c4, 0xb5f2b89a), TOBN(0x3de9c1e6, 0x290e5cd3), + TOBN(0x8c278bb6, 0x310a2ada), TOBN(0x420fa384, 0x0bdb323b)}}, + {{TOBN(0x0ae1d63b, 0x0eb919b0), TOBN(0xd74ee51d, 0xa74b9620), + TOBN(0x395458d0, 0xa674290c), TOBN(0x324c930f, 0x4620a510)}, + {TOBN(0x2d1f4d19, 0xfbac27d4), TOBN(0x4086e8ca, 0x9bedeeac), + TOBN(0x0cdd211b, 0x9b679ab8), TOBN(0x5970167d, 0x7090fec4)}}, + {{TOBN(0x3420f2c9, 0xfaf1fc63), TOBN(0x616d333a, 0x328c8bb4), + TOBN(0x7d65364c, 0x57f1fe4a), TOBN(0x9343e877, 0x55e5c73a)}, + {TOBN(0x5795176b, 0xe970e78c), TOBN(0xa36ccebf, 0x60533627), + TOBN(0xfc7c7380, 0x09cdfc1b), TOBN(0xb39a2afe, 0xb3fec326)}}, + {{TOBN(0xb7ff1ba1, 0x6224408a), TOBN(0xcc856e92, 0x247cfc5e), + TOBN(0x01f102e7, 0xc18bc493), TOBN(0x4613ab74, 0x2091c727)}, + {TOBN(0xaa25e89c, 0xc420bf2b), TOBN(0x00a53176, 0x90337ec2), + TOBN(0xd2be9f43, 0x7d025fc7), TOBN(0x3316fb85, 0x6e6fe3dc)}}, + {{TOBN(0x27520af5, 0x9ac50814), TOBN(0xfdf95e78, 0x9a8e4223), + TOBN(0xb7e7df2a, 0x56bec5a0), TOBN(0xf7022f7d, 0xdf159e5d)}, + {TOBN(0x93eeeab1, 0xcac1fe8f), TOBN(0x8040188c, 0x37451168), + TOBN(0x7ee8aa8a, 0xd967dce6), TOBN(0xfa0e79e7, 0x3abc9299)}}, + {{TOBN(0x67332cfc, 0x2064cfd1), TOBN(0x339c31de, 0xb0651934), + TOBN(0x719b28d5, 0x2a3bcbea), TOBN(0xee74c82b, 0x9d6ae5c6)}, + {TOBN(0x0927d05e, 0xbaf28ee6), TOBN(0x82cecf2c, 0x9d719028), + TOBN(0x0b0d353e, 0xddb30289), TOBN(0xfe4bb977, 0xfddb2e29)}}, + {{TOBN(0xbb5bb990, 0x640bfd9e), TOBN(0xd226e277, 0x82f62108), + TOBN(0x4bf00985, 0x02ffdd56), TOBN(0x7756758a, 0x2ca1b1b5)}, + {TOBN(0xc32b62a3, 0x5285fe91), TOBN(0xedbc546a, 0x8c9cd140), + TOBN(0x1e47a013, 0xaf5cb008), TOBN(0xbca7e720, 0x073ce8f2)}}, + {{TOBN(0xe10b2ab8, 0x17a91cae), TOBN(0xb89aab65, 0x08e27f63), + TOBN(0x7b3074a7, 0xdba3ddf9), TOBN(0x1c20ce09, 0x330c2972)}, + {TOBN(0x6b9917b4, 0x5fcf7e33), TOBN(0xe6793743, 0x945ceb42), + TOBN(0x18fc2215, 0x5c633d19), TOBN(0xad1adb3c, 0xc7485474)}}, + {{TOBN(0x646f9679, 0x6424c49b), TOBN(0xf888dfe8, 0x67c241c9), + TOBN(0xe12d4b93, 0x24f68b49), TOBN(0x9a6b62d8, 0xa571df20)}, + {TOBN(0x81b4b26d, 0x179483cb), TOBN(0x666f9632, 0x9511fae2), + TOBN(0xd281b3e4, 0xd53aa51f), TOBN(0x7f96a765, 0x7f3dbd16)}}, + {{TOBN(0xa7f8b5bf, 0x074a30ce), TOBN(0xd7f52107, 0x005a32e6), + TOBN(0x6f9e0907, 0x50237ed4), TOBN(0x2f21da47, 0x8096fa2b)}, + {TOBN(0xf3e19cb4, 0xeec863a0), TOBN(0xd18f77fd, 0x9527620a), + TOBN(0x9505c81c, 0x407c1cf8), TOBN(0x9998db4e, 0x1b6ec284)}}, + {{TOBN(0x7e3389e5, 0xc247d44d), TOBN(0x12507141, 0x3f4f3d80), + TOBN(0xd4ba0110, 0x4a78a6c7), TOBN(0x312874a0, 0x767720be)}, + {TOBN(0xded059a6, 0x75944370), 
TOBN(0xd6123d90, 0x3b2c0bdd), + TOBN(0xa56b717b, 0x51c108e3), TOBN(0x9bb7940e, 0x070623e9)}}, + {{TOBN(0x794e2d59, 0x84ac066c), TOBN(0xf5954a92, 0xe68c69a0), + TOBN(0x28c52458, 0x4fd99dcc), TOBN(0x60e639fc, 0xb1012517)}, + {TOBN(0xc2e60125, 0x7de79248), TOBN(0xe9ef6404, 0xf12fc6d7), + TOBN(0x4c4f2808, 0x2a3b5d32), TOBN(0x865ad32e, 0xc768eb8a)}}, + {{TOBN(0xac02331b, 0x13fb70b6), TOBN(0x037b44c1, 0x95599b27), + TOBN(0x1a860fc4, 0x60bd082c), TOBN(0xa2e25745, 0xc980cd01)}, + {TOBN(0xee3387a8, 0x1da0263e), TOBN(0x931bfb95, 0x2d10f3d6), + TOBN(0x5b687270, 0xa1f24a32), TOBN(0xf140e65d, 0xca494b86)}}, + {{TOBN(0x4f4ddf91, 0xb2f1ac7a), TOBN(0xf99eaabb, 0x760fee27), + TOBN(0x57f4008a, 0x49c228e5), TOBN(0x090be440, 0x1cf713bb)}, + {TOBN(0xac91fbe4, 0x5004f022), TOBN(0xd838c2c2, 0x569e1af6), + TOBN(0xd6c7d20b, 0x0f1daaa5), TOBN(0xaa063ac1, 0x1bbb02c0)}}, + {{TOBN(0x0938a422, 0x59558a78), TOBN(0x5343c669, 0x8435da2f), + TOBN(0x96f67b18, 0x034410dc), TOBN(0x7cc1e424, 0x84510804)}, + {TOBN(0x86a1543f, 0x16dfbb7d), TOBN(0x921fa942, 0x5b5bd592), + TOBN(0x9dcccb6e, 0xb33dd03c), TOBN(0x8581ddd9, 0xb843f51e)}}, + {{TOBN(0x54935fcb, 0x81d73c9e), TOBN(0x6d07e979, 0x0a5e97ab), + TOBN(0x4dc7b30a, 0xcf3a6bab), TOBN(0x147ab1f3, 0x170bee11)}, + {TOBN(0x0aaf8e3d, 0x9fafdee4), TOBN(0xfab3dbcb, 0x538a8b95), + TOBN(0x405df4b3, 0x6ef13871), TOBN(0xf1f4e9cb, 0x088d5a49)}}, + {{TOBN(0x9bcd24d3, 0x66b33f1d), TOBN(0x3b97b820, 0x5ce445c0), + TOBN(0xe2926549, 0xba93ff61), TOBN(0xd9c341ce, 0x4dafe616)}, + {TOBN(0xfb30a76e, 0x16efb6f3), TOBN(0xdf24b8ca, 0x605b953c), + TOBN(0x8bd52afe, 0xc2fffb9f), TOBN(0xbbac5ff7, 0xe19d0b96)}}, + {{TOBN(0x43c01b87, 0x459afccd), TOBN(0x6bd45143, 0xb7432652), + TOBN(0x84734530, 0x55b5d78e), TOBN(0x81088fdb, 0x1554ba7d)}, + {TOBN(0xada0a52c, 0x1e269375), TOBN(0xf9f037c4, 0x2dc5ec10), + TOBN(0xc0660607, 0x94bfbc11), TOBN(0xc0a630bb, 0xc9c40d2f)}}, + {{TOBN(0x5efc797e, 0xab64c31e), TOBN(0xffdb1dab, 0x74507144), + TOBN(0xf6124287, 0x1ca6790c), TOBN(0xe9609d81, 0xe69bf1bf)}, + {TOBN(0xdb898595, 0x00d24fc9), TOBN(0x9c750333, 0xe51fb417), + TOBN(0x51830a91, 0xfef7bbde), TOBN(0x0ce67dc8, 0x945f585c)}}, + {{TOBN(0x9a730ed4, 0x4763eb50), TOBN(0x24a0e221, 0xc1ab0d66), + TOBN(0x643b6393, 0x648748f3), TOBN(0x1982daa1, 0x6d3c6291)}, + {TOBN(0x6f00a9f7, 0x8bbc5549), TOBN(0x7a1783e1, 0x7f36384e), + TOBN(0xe8346323, 0xde977f50), TOBN(0x91ab688d, 0xb245502a)}}, + {{TOBN(0x331ab6b5, 0x6d0bdd66), TOBN(0x0a6ef32e, 0x64b71229), + TOBN(0x1028150e, 0xfe7c352f), TOBN(0x27e04350, 0xce7b39d3)}, + {TOBN(0x2a3c8acd, 0xc1070c82), TOBN(0xfb2034d3, 0x80c9feef), + TOBN(0x2d729621, 0x709f3729), TOBN(0x8df290bf, 0x62cb4549)}}, + {{TOBN(0x02f99f33, 0xfc2e4326), TOBN(0x3b30076d, 0x5eddf032), + TOBN(0xbb21f8cf, 0x0c652fb5), TOBN(0x314fb49e, 0xed91cf7b)}, + {TOBN(0xa013eca5, 0x2f700750), TOBN(0x2b9e3c23, 0x712a4575), + TOBN(0xe5355557, 0xaf30fbb0), TOBN(0x1ada3516, 0x7c77e771)}}, + {{TOBN(0x45f6ecb2, 0x7b135670), TOBN(0xe85d19df, 0x7cfc202e), + TOBN(0x0f1b50c7, 0x58d1be9f), TOBN(0x5ebf2c0a, 0xead2e344)}, + {TOBN(0x1531fe4e, 0xabc199c9), TOBN(0xc7032592, 0x56bab0ae), + TOBN(0x16ab2e48, 0x6c1fec54), TOBN(0x0f87fda8, 0x04280188)}}, + {{TOBN(0xdc9f46fc, 0x609e4a74), TOBN(0x2a44a143, 0xba667f91), + TOBN(0xbc3d8b95, 0xb4d83436), TOBN(0xa01e4bd0, 0xc7bd2958)}, + {TOBN(0x7b182932, 0x73483c90), TOBN(0xa79c6aa1, 0xa7c7b598), + TOBN(0xbf3983c6, 0xeaaac07e), TOBN(0x8f18181e, 0x96e0d4e6)}}, + {{TOBN(0x8553d37c, 0x051af62b), TOBN(0xe9a998eb, 0x0bf94496), + TOBN(0xe0844f9f, 0xb0d59aa1), TOBN(0x983fd558, 0xe6afb813)}, + {TOBN(0x9670c0ca, 
0x65d69804), TOBN(0x732b22de, 0x6ea5ff2d), + TOBN(0xd7640ba9, 0x5fd8623b), TOBN(0x9f619163, 0xa6351782)}}, + {{TOBN(0x0bfc27ee, 0xacee5043), TOBN(0xae419e73, 0x2eb10f02), + TOBN(0x19c028d1, 0x8943fb05), TOBN(0x71f01cf7, 0xff13aa2a)}, + {TOBN(0x7790737e, 0x8887a132), TOBN(0x67513309, 0x66318410), + TOBN(0x9819e8a3, 0x7ddb795e), TOBN(0xfecb8ef5, 0xdad100b2)}}, + {{TOBN(0x59f74a22, 0x3021926a), TOBN(0xb7c28a49, 0x6f9b4c1c), + TOBN(0xed1a733f, 0x912ad0ab), TOBN(0x42a910af, 0x01a5659c)}, + {TOBN(0x3842c6e0, 0x7bd68cab), TOBN(0x2b57fa38, 0x76d70ac8), + TOBN(0x8a6707a8, 0x3c53aaeb), TOBN(0x62c1c510, 0x65b4db18)}}, + {{TOBN(0x8de2c1fb, 0xb2d09dc7), TOBN(0xc3dfed12, 0x266bd23b), + TOBN(0x927d039b, 0xd5b27db6), TOBN(0x2fb2f0f1, 0x103243da)}, + {TOBN(0xf855a07b, 0x80be7399), TOBN(0xed9327ce, 0x1f9f27a8), + TOBN(0xa0bd99c7, 0x729bdef7), TOBN(0x2b67125e, 0x28250d88)}}, + {{TOBN(0x784b26e8, 0x8670ced7), TOBN(0xe3dfe41f, 0xc31bd3b4), + TOBN(0x9e353a06, 0xbcc85cbc), TOBN(0x302e2909, 0x60178a9d)}, + {TOBN(0x860abf11, 0xa6eac16e), TOBN(0x76447000, 0xaa2b3aac), + TOBN(0x46ff9d19, 0x850afdab), TOBN(0x35bdd6a5, 0xfdb2d4c1)}}, + {{TOBN(0xe82594b0, 0x7e5c9ce9), TOBN(0x0f379e53, 0x20af346e), + TOBN(0x608b31e3, 0xbc65ad4a), TOBN(0x710c6b12, 0x267c4826)}, + {TOBN(0x51c966f9, 0x71954cf1), TOBN(0xb1cec793, 0x0d0aa215), + TOBN(0x1f155989, 0x86bd23a8), TOBN(0xae2ff99c, 0xf9452e86)}}, + {{TOBN(0xd8dd953c, 0x340ceaa2), TOBN(0x26355275, 0x2e2e9333), + TOBN(0x15d4e5f9, 0x8586f06d), TOBN(0xd6bf94a8, 0xf7cab546)}, + {TOBN(0x33c59a0a, 0xb76a9af0), TOBN(0x52740ab3, 0xba095af7), + TOBN(0xc444de8a, 0x24389ca0), TOBN(0xcc6f9863, 0x706da0cb)}}, + {{TOBN(0xb5a741a7, 0x6b2515cf), TOBN(0x71c41601, 0x9585c749), + TOBN(0x78350d4f, 0xe683de97), TOBN(0x31d61524, 0x63d0b5f5)}, + {TOBN(0x7a0cc5e1, 0xfbce090b), TOBN(0xaac927ed, 0xfbcb2a5b), + TOBN(0xe920de49, 0x20d84c35), TOBN(0x8c06a0b6, 0x22b4de26)}}, + {{TOBN(0xd34dd58b, 0xafe7ddf3), TOBN(0x55851fed, 0xc1e6e55b), + TOBN(0xd1395616, 0x960696e7), TOBN(0x940304b2, 0x5f22705f)}, + {TOBN(0x6f43f861, 0xb0a2a860), TOBN(0xcf121282, 0x0e7cc981), + TOBN(0x12186212, 0x0ab64a96), TOBN(0x09215b9a, 0xb789383c)}}, + {{TOBN(0x311eb305, 0x37387c09), TOBN(0xc5832fce, 0xf03ee760), + TOBN(0x30358f58, 0x32f7ea19), TOBN(0xe01d3c34, 0x91d53551)}, + {TOBN(0x1ca5ee41, 0xda48ea80), TOBN(0x34e71e8e, 0xcf4fa4c1), + TOBN(0x312abd25, 0x7af1e1c7), TOBN(0xe3afcdeb, 0x2153f4a5)}}, + {{TOBN(0x9d5c84d7, 0x00235e9a), TOBN(0x0308d3f4, 0x8c4c836f), + TOBN(0xc0a66b04, 0x89332de5), TOBN(0x610dd399, 0x89e566ef)}, + {TOBN(0xf8eea460, 0xd1ac1635), TOBN(0x84cbb3fb, 0x20a2c0df), + TOBN(0x40afb488, 0xe74a48c5), TOBN(0x29738198, 0xd326b150)}}, + {{TOBN(0x2a17747f, 0xa6d74081), TOBN(0x60ea4c05, 0x55a26214), + TOBN(0x53514bb4, 0x1f88c5fe), TOBN(0xedd64567, 0x7e83426c)}, + {TOBN(0xd5d6cbec, 0x96460b25), TOBN(0xa12fd0ce, 0x68dc115e), + TOBN(0xc5bc3ed2, 0x697840ea), TOBN(0x969876a8, 0xa6331e31)}}, + {{TOBN(0x60c36217, 0x472ff580), TOBN(0xf4229705, 0x4ad41393), + TOBN(0x4bd99ef0, 0xa03b8b92), TOBN(0x501c7317, 0xc144f4f6)}, + {TOBN(0x159009b3, 0x18464945), TOBN(0x6d5e594c, 0x74c5c6be), + TOBN(0x2d587011, 0x321a3660), TOBN(0xd1e184b1, 0x3898d022)}}, + {{TOBN(0x5ba04752, 0x4c6a7e04), TOBN(0x47fa1e2b, 0x45550b65), + TOBN(0x9419daf0, 0x48c0a9a5), TOBN(0x66362953, 0x7c243236)}, + {TOBN(0xcd0744b1, 0x5cb12a88), TOBN(0x561b6f9a, 0x2b646188), + TOBN(0x599415a5, 0x66c2c0c0), TOBN(0xbe3f0859, 0x0f83f09a)}}, + {{TOBN(0x9141c5be, 0xb92041b8), TOBN(0x01ae38c7, 0x26477d0d), + TOBN(0xca8b71f3, 0xd12c7a94), TOBN(0xfab5b31f, 0x765c70db)}, + 
{TOBN(0x76ae7492, 0x487443e9), TOBN(0x8595a310, 0x990d1349), + TOBN(0xf8dbeda8, 0x7d460a37), TOBN(0x7f7ad082, 0x1e45a38f)}}, + {{TOBN(0xed1d4db6, 0x1059705a), TOBN(0xa3dd492a, 0xe6b9c697), + TOBN(0x4b92ee3a, 0x6eb38bd5), TOBN(0xbab2609d, 0x67cc0bb7)}, + {TOBN(0x7fc4fe89, 0x6e70ee82), TOBN(0xeff2c56e, 0x13e6b7e3), + TOBN(0x9b18959e, 0x34d26fca), TOBN(0x2517ab66, 0x889d6b45)}}, + {{TOBN(0xf167b4e0, 0xbdefdd4f), TOBN(0x69958465, 0xf366e401), + TOBN(0x5aa368ab, 0xa73bbec0), TOBN(0x12148709, 0x7b240c21)}, + {TOBN(0x378c3233, 0x18969006), TOBN(0xcb4d73ce, 0xe1fe53d1), + TOBN(0x5f50a80e, 0x130c4361), TOBN(0xd67f5951, 0x7ef5212b)}}, + {{TOBN(0xf145e21e, 0x9e70c72e), TOBN(0xb2e52e29, 0x5566d2fb), + TOBN(0x44eaba4a, 0x032397f5), TOBN(0x5e56937b, 0x7e31a7de)}, + {TOBN(0x68dcf517, 0x456c61e1), TOBN(0xbc2e954a, 0xa8b0a388), + TOBN(0xe3552fa7, 0x60a8b755), TOBN(0x03442dae, 0x73ad0cde)}}, + {{TOBN(0x37ffe747, 0xceb26210), TOBN(0x983545e8, 0x787baef9), + TOBN(0x8b8c8535, 0x86a3de31), TOBN(0xc621dbcb, 0xfacd46db)}, + {TOBN(0x82e442e9, 0x59266fbb), TOBN(0xa3514c37, 0x339d471c), + TOBN(0x3a11b771, 0x62cdad96), TOBN(0xf0cb3b3c, 0xecf9bdf0)}}, + {{TOBN(0x3fcbdbce, 0x478e2135), TOBN(0x7547b5cf, 0xbda35342), + TOBN(0xa97e81f1, 0x8a677af6), TOBN(0xc8c2bf83, 0x28817987)}, + {TOBN(0xdf07eaaf, 0x45580985), TOBN(0xc68d1f05, 0xc93b45cb), + TOBN(0x106aa2fe, 0xc77b4cac), TOBN(0x4c1d8afc, 0x04a7ae86)}}, + {{TOBN(0xdb41c3fd, 0x9eb45ab2), TOBN(0x5b234b5b, 0xd4b22e74), + TOBN(0xda253dec, 0xf215958a), TOBN(0x67e0606e, 0xa04edfa0)}, + {TOBN(0xabbbf070, 0xef751b11), TOBN(0xf352f175, 0xf6f06dce), + TOBN(0xdfc4b6af, 0x6839f6b4), TOBN(0x53ddf9a8, 0x9959848e)}}, + {{TOBN(0xda49c379, 0xc21520b0), TOBN(0x90864ff0, 0xdbd5d1b6), + TOBN(0x2f055d23, 0x5f49c7f7), TOBN(0xe51e4e6a, 0xa796b2d8)}, + {TOBN(0xc361a67f, 0x5c9dc340), TOBN(0x5ad53c37, 0xbca7c620), + TOBN(0xda1d6588, 0x32c756d0), TOBN(0xad60d911, 0x8bb67e13)}}, + {{TOBN(0xd6c47bdf, 0x0eeec8c6), TOBN(0x4a27fec1, 0x078a1821), + TOBN(0x081f7415, 0xc3099524), TOBN(0x8effdf0b, 0x82cd8060)}, + {TOBN(0xdb70ec1c, 0x65842df8), TOBN(0x8821b358, 0xd319a901), + TOBN(0x72ee56ee, 0xde42b529), TOBN(0x5bb39592, 0x236e4286)}}, + {{TOBN(0xd1183316, 0xfd6f7140), TOBN(0xf9fadb5b, 0xbd8e81f7), + TOBN(0x701d5e0c, 0x5a02d962), TOBN(0xfdee4dbf, 0x1b601324)}, + {TOBN(0xbed17407, 0x35d7620e), TOBN(0x04e3c2c3, 0xf48c0012), + TOBN(0x9ee29da7, 0x3455449a), TOBN(0x562cdef4, 0x91a836c4)}}, + {{TOBN(0x8f682a5f, 0x47701097), TOBN(0x617125d8, 0xff88d0c2), + TOBN(0x948fda24, 0x57bb86dd), TOBN(0x348abb8f, 0x289f7286)}, + {TOBN(0xeb10eab5, 0x99d94bbd), TOBN(0xd51ba28e, 0x4684d160), + TOBN(0xabe0e51c, 0x30c8f41a), TOBN(0x66588b45, 0x13254f4a)}}, + {{TOBN(0x147ebf01, 0xfad097a5), TOBN(0x49883ea8, 0x610e815d), + TOBN(0xe44d60ba, 0x8a11de56), TOBN(0xa970de6e, 0x827a7a6d)}, + {TOBN(0x2be41424, 0x5e17fc19), TOBN(0xd833c657, 0x01214057), + TOBN(0x1375813b, 0x363e723f), TOBN(0x6820bb88, 0xe6a52e9b)}}, + {{TOBN(0x7e7f6970, 0xd875d56a), TOBN(0xd6a0a9ac, 0x51fbf6bf), + TOBN(0x54ba8790, 0xa3083c12), TOBN(0xebaeb23d, 0x6ae7eb64)}, + {TOBN(0xa8685c3a, 0xb99a907a), TOBN(0xf1e74550, 0x026bf40b), + TOBN(0x7b73a027, 0xc802cd9e), TOBN(0x9a8a927c, 0x4fef4635)}}, + {{TOBN(0xe1b6f60c, 0x08191224), TOBN(0xc4126ebb, 0xde4ec091), + TOBN(0xe1dff4dc, 0x4ae38d84), TOBN(0xde3f57db, 0x4f2ef985)}, + {TOBN(0x34964337, 0xd446a1dd), TOBN(0x7bf217a0, 0x859e77f6), + TOBN(0x8ff10527, 0x8e1d13f5), TOBN(0xa304ef03, 0x74eeae27)}}, + {{TOBN(0xfc6f5e47, 0xd19dfa5a), TOBN(0xdb007de3, 0x7fad982b), + TOBN(0x28205ad1, 0x613715f5), TOBN(0x251e6729, 0x7889529e)}, 
+ {TOBN(0x72705184, 0x1ae98e78), TOBN(0xf818537d, 0x271cac32), + TOBN(0xc8a15b7e, 0xb7f410f5), TOBN(0xc474356f, 0x81f62393)}}, + {{TOBN(0x92dbdc5a, 0xc242316b), TOBN(0xabe060ac, 0xdbf4aff5), + TOBN(0x6e8c38fe, 0x909a8ec6), TOBN(0x43e514e5, 0x6116cb94)}, + {TOBN(0x2078fa38, 0x07d784f9), TOBN(0x1161a880, 0xf4b5b357), + TOBN(0x5283ce79, 0x13adea3d), TOBN(0x0756c3e6, 0xcc6a910b)}}, + {{TOBN(0x60bcfe01, 0xaaa79697), TOBN(0x04a73b29, 0x56391db1), + TOBN(0xdd8dad47, 0x189b45a0), TOBN(0xbfac0dd0, 0x48d5b8d9)}, + {TOBN(0x34ab3af5, 0x7d3d2ec2), TOBN(0x6fa2fc2d, 0x207bd3af), + TOBN(0x9ff40092, 0x66550ded), TOBN(0x719b3e87, 0x1fd5b913)}}, + {{TOBN(0xa573a496, 0x6d17fbc7), TOBN(0x0cd1a70a, 0x73d2b24e), + TOBN(0x34e2c5ca, 0xb2676937), TOBN(0xe7050b06, 0xbf669f21)}, + {TOBN(0xfbe948b6, 0x1ede9046), TOBN(0xa0530051, 0x97662659), + TOBN(0x58cbd4ed, 0xf10124c5), TOBN(0xde2646e4, 0xdd6c06c8)}}, + {{TOBN(0x332f8108, 0x8cad38c0), TOBN(0x471b7e90, 0x6bd68ae2), + TOBN(0x56ac3fb2, 0x0d8e27a3), TOBN(0xb54660db, 0x136b4b0d)}, + {TOBN(0x123a1e11, 0xa6fd8de4), TOBN(0x44dbffea, 0xa37799ef), + TOBN(0x4540b977, 0xce6ac17c), TOBN(0x495173a8, 0xaf60acef)}}}, + {{{TOBN(0x9ebb284d, 0x391c2a82), TOBN(0xbcdd4863, 0x158308e8), + TOBN(0x006f16ec, 0x83f1edca), TOBN(0xa13e2c37, 0x695dc6c8)}, + {TOBN(0x2ab756f0, 0x4a057a87), TOBN(0xa8765500, 0xa6b48f98), + TOBN(0x4252face, 0x68651c44), TOBN(0xa52b540b, 0xe1765e02)}}, + {{TOBN(0x4f922fc5, 0x16a0d2bb), TOBN(0x0d5cc16c, 0x1a623499), + TOBN(0x9241cf3a, 0x57c62c8b), TOBN(0x2f5e6961, 0xfd1b667f)}, + {TOBN(0x5c15c70b, 0xf5a01797), TOBN(0x3d20b44d, 0x60956192), + TOBN(0x04911b37, 0x071fdb52), TOBN(0xf648f916, 0x8d6f0f7b)}}, + {{TOBN(0x6dc1acaf, 0xe60b7cf7), TOBN(0x25860a50, 0x84a9d869), + TOBN(0x56fc6f09, 0xe7ba8ac4), TOBN(0x828c5bd0, 0x6148d29e)}, + {TOBN(0xac6b435e, 0xdc55ae5f), TOBN(0xa527f56c, 0xc0117411), + TOBN(0x94d5045e, 0xfd24342c), TOBN(0x2c4c0a35, 0x70b67c0d)}}, + {{TOBN(0x027cc8b8, 0xfac61d9a), TOBN(0x7d25e062, 0xe3c6fe8a), + TOBN(0xe08805bf, 0xe5bff503), TOBN(0x13271e6c, 0x6ff632f7)}, + {TOBN(0x55dca6c0, 0x232f76a5), TOBN(0x8957c32d, 0x701ef426), + TOBN(0xee728bcb, 0xa10a5178), TOBN(0x5ea60411, 0xb62c5173)}}, + {{TOBN(0xfc4e964e, 0xd0b8892b), TOBN(0x9ea17683, 0x9301bb74), + TOBN(0x6265c5ae, 0xfcc48626), TOBN(0xe60cf82e, 0xbb3e9102)}, + {TOBN(0x57adf797, 0xd4df5531), TOBN(0x235b59a1, 0x8deeefe2), + TOBN(0x60adcf58, 0x3f306eb1), TOBN(0x105c2753, 0x3d09492d)}}, + {{TOBN(0x4090914b, 0xb5def996), TOBN(0x1cb69c83, 0x233dd1e7), + TOBN(0xc1e9c1d3, 0x9b3d5e76), TOBN(0x1f3338ed, 0xfccf6012)}, + {TOBN(0xb1e95d0d, 0x2f5378a8), TOBN(0xacf4c2c7, 0x2f00cd21), + TOBN(0x6e984240, 0xeb5fe290), TOBN(0xd66c038d, 0x248088ae)}}, + {{TOBN(0x804d264a, 0xf94d70cf), TOBN(0xbdb802ef, 0x7314bf7e), + TOBN(0x8fb54de2, 0x4333ed02), TOBN(0x740461e0, 0x285635d9)}, + {TOBN(0x4113b2c8, 0x365e9383), TOBN(0xea762c83, 0x3fdef652), + TOBN(0x4eec6e2e, 0x47b956c1), TOBN(0xa3d814be, 0x65620fa4)}}, + {{TOBN(0x9ad5462b, 0xb4d8bc50), TOBN(0x181c0b16, 0xa9195770), + TOBN(0xebd4fe1c, 0x78412a68), TOBN(0xae0341bc, 0xc0dff48c)}, + {TOBN(0xb6bc45cf, 0x7003e866), TOBN(0xf11a6dea, 0x8a24a41b), + TOBN(0x5407151a, 0xd04c24c2), TOBN(0x62c9d27d, 0xda5b7b68)}}, + {{TOBN(0x2e964235, 0x88cceff6), TOBN(0x8594c54f, 0x8b07ed69), + TOBN(0x1578e73c, 0xc84d0d0d), TOBN(0x7b4e1055, 0xff532868)}, + {TOBN(0xa348c0d5, 0xb5ec995a), TOBN(0xbf4b9d55, 0x14289a54), + TOBN(0x9ba155a6, 0x58fbd777), TOBN(0x186ed7a8, 0x1a84491d)}}, + {{TOBN(0xd4992b30, 0x614c0900), TOBN(0xda98d121, 0xbd00c24b), + TOBN(0x7f534dc8, 0x7ec4bfa1), TOBN(0x4a5ff674, 
0x37dc34bc)}, + {TOBN(0x68c196b8, 0x1d7ea1d7), TOBN(0x38cf2893, 0x80a6d208), + TOBN(0xfd56cd09, 0xe3cbbd6e), TOBN(0xec72e27e, 0x4205a5b6)}}, + {{TOBN(0x15ea68f5, 0xa44f77f7), TOBN(0x7aa5f9fd, 0xb43c52bc), + TOBN(0x86ff676f, 0x94f0e609), TOBN(0xa4cde963, 0x2e2d432b)}, + {TOBN(0x8cafa0c0, 0xeee470af), TOBN(0x84137d0e, 0x8a3f5ec8), + TOBN(0xebb40411, 0xfaa31231), TOBN(0xa239c13f, 0x6f7f7ccf)}}, + {{TOBN(0x32865719, 0xa8afd30b), TOBN(0x86798328, 0x8a826dce), + TOBN(0xdf04e891, 0xc4a8fbe0), TOBN(0xbb6b6e1b, 0xebf56ad3)}, + {TOBN(0x0a695b11, 0x471f1ff0), TOBN(0xd76c3389, 0xbe15baf0), + TOBN(0x018edb95, 0xbe96c43e), TOBN(0xf2beaaf4, 0x90794158)}}, + {{TOBN(0x152db09e, 0xc3076a27), TOBN(0x5e82908e, 0xe416545d), + TOBN(0xa2c41272, 0x356d6f2e), TOBN(0xdc9c9642, 0x31fd74e1)}, + {TOBN(0x66ceb88d, 0x519bf615), TOBN(0xe29ecd76, 0x05a2274e), + TOBN(0x3a0473c4, 0xbf5e2fa0), TOBN(0x6b6eb671, 0x64284e67)}}, + {{TOBN(0xe8b97932, 0xb88756dd), TOBN(0xed4e8652, 0xf17e3e61), + TOBN(0xc2dd1499, 0x3ee1c4a4), TOBN(0xc0aaee17, 0x597f8c0e)}, + {TOBN(0x15c4edb9, 0x6c168af3), TOBN(0x6563c7bf, 0xb39ae875), + TOBN(0xadfadb6f, 0x20adb436), TOBN(0xad55e8c9, 0x9a042ac0)}}, + {{TOBN(0x975a1ed8, 0xb76da1f5), TOBN(0x10dfa466, 0xa58acb94), + TOBN(0x8dd7f7e3, 0xac060282), TOBN(0x6813e66a, 0x572a051e)}, + {TOBN(0xb4ccae1e, 0x350cb901), TOBN(0xb653d656, 0x50cb7822), + TOBN(0x42484710, 0xdfab3b87), TOBN(0xcd7ee537, 0x9b670fd0)}}, + {{TOBN(0x0a50b12e, 0x523b8bf6), TOBN(0x8009eb5b, 0x8f910c1b), + TOBN(0xf535af82, 0x4a167588), TOBN(0x0f835f9c, 0xfb2a2abd)}, + {TOBN(0xf59b2931, 0x2afceb62), TOBN(0xc797df2a, 0x169d383f), + TOBN(0xeb3f5fb0, 0x66ac02b0), TOBN(0x029d4c6f, 0xdaa2d0ca)}}, + {{TOBN(0xd4059bc1, 0xafab4bc5), TOBN(0x833f5c6f, 0x56783247), + TOBN(0xb5346630, 0x8d2d3605), TOBN(0x83387891, 0xd34d8433)}, + {TOBN(0xd973b30f, 0xadd9419a), TOBN(0xbcca1099, 0xafe3fce8), + TOBN(0x08178315, 0x0809aac6), TOBN(0x01b7f21a, 0x540f0f11)}}, + {{TOBN(0x65c29219, 0x909523c8), TOBN(0xa62f648f, 0xa3a1c741), + TOBN(0x88598d4f, 0x60c9e55a), TOBN(0xbce9141b, 0x0e4f347a)}, + {TOBN(0x9af97d84, 0x35f9b988), TOBN(0x0210da62, 0x320475b6), + TOBN(0x3c076e22, 0x9191476c), TOBN(0x7520dbd9, 0x44fc7834)}}, + {{TOBN(0x6a6b2cfe, 0xc1ab1bbd), TOBN(0xef8a65be, 0xdc650938), + TOBN(0x72855540, 0x805d7bc4), TOBN(0xda389396, 0xed11fdfd)}, + {TOBN(0xa9d5bd36, 0x74660876), TOBN(0x11d67c54, 0xb45dff35), + TOBN(0x6af7d148, 0xa4f5da94), TOBN(0xbb8d4c3f, 0xc0bbeb31)}}, + {{TOBN(0x87a7ebd1, 0xe0a1b12a), TOBN(0x1e4ef88d, 0x770ba95f), + TOBN(0x8c33345c, 0xdc2ae9cb), TOBN(0xcecf1276, 0x01cc8403)}, + {TOBN(0x687c012e, 0x1b39b80f), TOBN(0xfd90d0ad, 0x35c33ba4), + TOBN(0xa3ef5a67, 0x5c9661c2), TOBN(0x368fc88e, 0xe017429e)}}, + {{TOBN(0xd30c6761, 0x196a2fa2), TOBN(0x931b9817, 0xbd5b312e), + TOBN(0xba01000c, 0x72f54a31), TOBN(0xa203d2c8, 0x66eaa541)}, + {TOBN(0xf2abdee0, 0x98939db3), TOBN(0xe37d6c2c, 0x3e606c02), + TOBN(0xf2921574, 0x521ff643), TOBN(0x2781b3c4, 0xd7e2fca3)}}, + {{TOBN(0x664300b0, 0x7850ec06), TOBN(0xac5a38b9, 0x7d3a10cf), + TOBN(0x9233188d, 0xe34ab39d), TOBN(0xe77057e4, 0x5072cbb9)}, + {TOBN(0xbcf0c042, 0xb59e78df), TOBN(0x4cfc91e8, 0x1d97de52), + TOBN(0x4661a26c, 0x3ee0ca4a), TOBN(0x5620a4c1, 0xfb8507bc)}}, + {{TOBN(0x4b44d4aa, 0x049f842c), TOBN(0xceabc5d5, 0x1540e82b), + TOBN(0x306710fd, 0x15c6f156), TOBN(0xbe5ae52b, 0x63db1d72)}, + {TOBN(0x06f1e7e6, 0x334957f1), TOBN(0x57e388f0, 0x31144a70), + TOBN(0xfb69bb2f, 0xdf96447b), TOBN(0x0f78ebd3, 0x73e38a12)}}, + {{TOBN(0xb8222605, 0x2b7ce542), TOBN(0xe6d4ce99, 0x7472bde1), + TOBN(0x53e16ebe, 0x09d2f4da), 
TOBN(0x180ff42e, 0x53b92b2e)}, + {TOBN(0xc59bcc02, 0x2c34a1c6), TOBN(0x3803d6f9, 0x422c46c2), + TOBN(0x18aff74f, 0x5c14a8a2), TOBN(0x55aebf80, 0x10a08b28)}}, + {{TOBN(0x66097d58, 0x7135593f), TOBN(0x32e6eff7, 0x2be570cd), + TOBN(0x584e6a10, 0x2a8c860d), TOBN(0xcd185890, 0xa2eb4163)}, + {TOBN(0x7ceae99d, 0x6d97e134), TOBN(0xd42c6b70, 0xdd8447ce), + TOBN(0x59ddbb4a, 0xb8c50273), TOBN(0x03c612df, 0x3cf34e1e)}}, + {{TOBN(0x84b9ca15, 0x04b6c5a0), TOBN(0x35216f39, 0x18f0e3a3), + TOBN(0x3ec2d2bc, 0xbd986c00), TOBN(0x8bf546d9, 0xd19228fe)}, + {TOBN(0xd1c655a4, 0x4cd623c3), TOBN(0x366ce718, 0x502b8e5a), + TOBN(0x2cfc84b4, 0xeea0bfe7), TOBN(0xe01d5cee, 0xcf443e8e)}}, + {{TOBN(0x8ec045d9, 0x036520f8), TOBN(0xdfb3c3d1, 0x92d40e98), + TOBN(0x0bac4cce, 0xcc559a04), TOBN(0x35eccae5, 0x240ea6b1)}, + {TOBN(0x180b32db, 0xf8a5a0ac), TOBN(0x547972a5, 0xeb699700), + TOBN(0xa3765801, 0xca26bca0), TOBN(0x57e09d0e, 0xa647f25a)}}, + {{TOBN(0xb956970e, 0x2fdd23cc), TOBN(0xb80288bc, 0x5682e971), + TOBN(0xe6e6d91e, 0x9ae86ebc), TOBN(0x0564c83f, 0x8c9f1939)}, + {TOBN(0x551932a2, 0x39560368), TOBN(0xe893752b, 0x049c28e2), + TOBN(0x0b03cee5, 0xa6a158c3), TOBN(0xe12d656b, 0x04964263)}}, + {{TOBN(0x4b47554e, 0x63e3bc1d), TOBN(0xc719b6a2, 0x45044ff7), + TOBN(0x4f24d30a, 0xe48daa07), TOBN(0xa3f37556, 0xc8c1edc3)}, + {TOBN(0x9a47bf76, 0x0700d360), TOBN(0xbb1a1824, 0x822ae4e2), + TOBN(0x22e275a3, 0x89f1fb4c), TOBN(0x72b1aa23, 0x9968c5f5)}}, + {{TOBN(0xa75feaca, 0xbe063f64), TOBN(0x9b392f43, 0xbce47a09), + TOBN(0xd4241509, 0x1ad07aca), TOBN(0x4b0c591b, 0x8d26cd0f)}, + {TOBN(0x2d42ddfd, 0x92f1169a), TOBN(0x63aeb1ac, 0x4cbf2392), + TOBN(0x1de9e877, 0x0691a2af), TOBN(0xebe79af7, 0xd98021da)}}, + {{TOBN(0xcfdf2a4e, 0x40e50acf), TOBN(0xf0a98ad7, 0xaf01d665), + TOBN(0xefb640bf, 0x1831be1f), TOBN(0x6fe8bd2f, 0x80e9ada0)}, + {TOBN(0x94c103a1, 0x6cafbc91), TOBN(0x170f8759, 0x8308e08c), + TOBN(0x5de2d2ab, 0x9780ff4f), TOBN(0x666466bc, 0x45b201f2)}}, + {{TOBN(0x58af2010, 0xf5b343bc), TOBN(0x0f2e400a, 0xf2f142fe), + TOBN(0x3483bfde, 0xa85f4bdf), TOBN(0xf0b1d093, 0x03bfeaa9)}, + {TOBN(0x2ea01b95, 0xc7081603), TOBN(0xe943e4c9, 0x3dba1097), + TOBN(0x47be92ad, 0xb438f3a6), TOBN(0x00bb7742, 0xe5bf6636)}}, + {{TOBN(0x136b7083, 0x824297b4), TOBN(0x9d0e5580, 0x5584455f), + TOBN(0xab48cedc, 0xf1c7d69e), TOBN(0x53a9e481, 0x2a256e76)}, + {TOBN(0x0402b0e0, 0x65eb2413), TOBN(0xdadbbb84, 0x8fc407a7), + TOBN(0xa65cd5a4, 0x8d7f5492), TOBN(0x21d44293, 0x74bae294)}}, + {{TOBN(0x66917ce6, 0x3b5f1cc4), TOBN(0x37ae52ea, 0xce872e62), + TOBN(0xbb087b72, 0x2905f244), TOBN(0x12077086, 0x1e6af74f)}, + {TOBN(0x4b644e49, 0x1058edea), TOBN(0x827510e3, 0xb638ca1d), + TOBN(0x8cf2b704, 0x6038591c), TOBN(0xffc8b47a, 0xfe635063)}}, + {{TOBN(0x3ae220e6, 0x1b4d5e63), TOBN(0xbd864742, 0x9d961b4b), + TOBN(0x610c107e, 0x9bd16bed), TOBN(0x4270352a, 0x1127147b)}, + {TOBN(0x7d17ffe6, 0x64cfc50e), TOBN(0x50dee01a, 0x1e36cb42), + TOBN(0x068a7622, 0x35dc5f9a), TOBN(0x9a08d536, 0xdf53f62c)}}, + {{TOBN(0x4ed71457, 0x6be5f7de), TOBN(0xd93006f8, 0xc2263c9e), + TOBN(0xe073694c, 0xcacacb36), TOBN(0x2ff7a5b4, 0x3ae118ab)}, + {TOBN(0x3cce53f1, 0xcd871236), TOBN(0xf156a39d, 0xc2aa6d52), + TOBN(0x9cc5f271, 0xb198d76d), TOBN(0xbc615b6f, 0x81383d39)}}, + {{TOBN(0xa54538e8, 0xde3eee6b), TOBN(0x58c77538, 0xab910d91), + TOBN(0x31e5bdbc, 0x58d278bd), TOBN(0x3cde4adf, 0xb963acae)}, + {TOBN(0xb1881fd2, 0x5302169c), TOBN(0x8ca60fa0, 0xa989ed8b), + TOBN(0xa1999458, 0xff96a0ee), TOBN(0xc1141f03, 0xac6c283d)}}, + {{TOBN(0x7677408d, 0x6dfafed3), TOBN(0x33a01653, 0x39661588), + TOBN(0x3c9c15ec, 
0x0b726fa0), TOBN(0x090cfd93, 0x6c9b56da)}, + {TOBN(0xe34f4bae, 0xa3c40af5), TOBN(0x3469eadb, 0xd21129f1), + TOBN(0xcc51674a, 0x1e207ce8), TOBN(0x1e293b24, 0xc83b1ef9)}}, + {{TOBN(0x17173d13, 0x1e6c0bb4), TOBN(0x19004695, 0x90776d35), + TOBN(0xe7980e34, 0x6de6f922), TOBN(0x873554cb, 0xf4dd9a22)}, + {TOBN(0x0316c627, 0xcbf18a51), TOBN(0x4d93651b, 0x3032c081), + TOBN(0x207f2771, 0x3946834d), TOBN(0x2c08d7b4, 0x30cdbf80)}}, + {{TOBN(0x137a4fb4, 0x86df2a61), TOBN(0xa1ed9c07, 0xecf7b4a2), + TOBN(0xb2e460e2, 0x7bd042ff), TOBN(0xb7f5e2fa, 0x5f62f5ec)}, + {TOBN(0x7aa6ec6b, 0xcc2423b7), TOBN(0x75ce0a7f, 0xba63eea7), + TOBN(0x67a45fb1, 0xf250a6e1), TOBN(0x93bc919c, 0xe53cdc9f)}}, + {{TOBN(0x9271f56f, 0x871942df), TOBN(0x2372ff6f, 0x7859ad66), + TOBN(0x5f4c2b96, 0x33cb1a78), TOBN(0xe3e29101, 0x5838aa83)}, + {TOBN(0xa7ed1611, 0xe4e8110c), TOBN(0x2a2d70d5, 0x330198ce), + TOBN(0xbdf132e8, 0x6720efe0), TOBN(0xe61a8962, 0x66a471bf)}}, + {{TOBN(0x796d3a85, 0x825808bd), TOBN(0x51dc3cb7, 0x3fd6e902), + TOBN(0x643c768a, 0x916219d1), TOBN(0x36cd7685, 0xa2ad7d32)}, + {TOBN(0xe3db9d05, 0xb22922a4), TOBN(0x6494c87e, 0xdba29660), + TOBN(0xf0ac91df, 0xbcd2ebc7), TOBN(0x4deb57a0, 0x45107f8d)}}, + {{TOBN(0x42271f59, 0xc3d12a73), TOBN(0x5f71687c, 0xa5c2c51d), + TOBN(0xcb1f50c6, 0x05797bcb), TOBN(0x29ed0ed9, 0xd6d34eb0)}, + {TOBN(0xe5fe5b47, 0x4683c2eb), TOBN(0x4956eeb5, 0x97447c46), + TOBN(0x5b163a43, 0x71207167), TOBN(0x93fa2fed, 0x0248c5ef)}}, + {{TOBN(0x67930af2, 0x31f63950), TOBN(0xa77797c1, 0x14caa2c9), + TOBN(0x526e80ee, 0x27ac7e62), TOBN(0xe1e6e626, 0x58b28aec)}, + {TOBN(0x636178b0, 0xb3c9fef0), TOBN(0xaf7752e0, 0x6d5f90be), + TOBN(0x94ecaf18, 0xeece51cf), TOBN(0x2864d0ed, 0xca806e1f)}}, + {{TOBN(0x6de2e383, 0x97c69134), TOBN(0x5a42c316, 0xeb291293), + TOBN(0xc7779219, 0x6a60bae0), TOBN(0xa24de346, 0x6b7599d1)}, + {TOBN(0x49d374aa, 0xb75d4941), TOBN(0x98900586, 0x2d501ff0), + TOBN(0x9f16d40e, 0xeb7974cf), TOBN(0x1033860b, 0xcdd8c115)}}, + {{TOBN(0xb6c69ac8, 0x2094cec3), TOBN(0x9976fb88, 0x403b770c), + TOBN(0x1dea026c, 0x4859590d), TOBN(0xb6acbb46, 0x8562d1fd)}, + {TOBN(0x7cd6c461, 0x44569d85), TOBN(0xc3190a36, 0x97f0891d), + TOBN(0xc6f53195, 0x48d5a17d), TOBN(0x7d919966, 0xd749abc8)}}, + {{TOBN(0x65104837, 0xdd1c8a20), TOBN(0x7e5410c8, 0x2f683419), + TOBN(0x958c3ca8, 0xbe94022e), TOBN(0x605c3197, 0x6145dac2)}, + {TOBN(0x3fc07501, 0x01683d54), TOBN(0x1d7127c5, 0x595b1234), + TOBN(0x10b8f87c, 0x9481277f), TOBN(0x677db2a8, 0xe65a1adb)}}, + {{TOBN(0xec2fccaa, 0xddce3345), TOBN(0x2a6811b7, 0x012a4350), + TOBN(0x96760ff1, 0xac598bdc), TOBN(0x054d652a, 0xd1bf4128)}, + {TOBN(0x0a1151d4, 0x92a21005), TOBN(0xad7f3971, 0x33110fdf), + TOBN(0x8c95928c, 0x1960100f), TOBN(0x6c91c825, 0x7bf03362)}}, + {{TOBN(0xc8c8b2a2, 0xce309f06), TOBN(0xfdb27b59, 0xca27204b), + TOBN(0xd223eaa5, 0x0848e32e), TOBN(0xb93e4b2e, 0xe7bfaf1e)}, + {TOBN(0xc5308ae6, 0x44aa3ded), TOBN(0x317a666a, 0xc015d573), + TOBN(0xc888ce23, 0x1a979707), TOBN(0xf141c1e6, 0x0d5c4958)}}, + {{TOBN(0xb53b7de5, 0x61906373), TOBN(0x858dbade, 0xeb999595), + TOBN(0x8cbb47b2, 0xa59e5c36), TOBN(0x660318b3, 0xdcf4e842)}, + {TOBN(0xbd161ccd, 0x12ba4b7a), TOBN(0xf399daab, 0xf8c8282a), + TOBN(0x1587633a, 0xeeb2130d), TOBN(0xa465311a, 0xda38dd7d)}}, + {{TOBN(0x5f75eec8, 0x64d3779b), TOBN(0x3c5d0476, 0xad64c171), + TOBN(0x87410371, 0x2a914428), TOBN(0x8096a891, 0x90e2fc29)}, + {TOBN(0xd3d2ae9d, 0x23b3ebc2), TOBN(0x90bdd6db, 0xa580cfd6), + TOBN(0x52dbb7f3, 0xc5b01f6c), TOBN(0xe68eded4, 0xe102a2dc)}}, + {{TOBN(0x17785b77, 0x99eb6df0), TOBN(0x26c3cc51, 0x7386b779), + 
TOBN(0x345ed988, 0x6417a48e), TOBN(0xe990b4e4, 0x07d6ef31)}, + {TOBN(0x0f456b7e, 0x2586abba), TOBN(0x239ca6a5, 0x59c96e9a), + TOBN(0xe327459c, 0xe2eb4206), TOBN(0x3a4c3313, 0xa002b90a)}}, + {{TOBN(0x2a114806, 0xf6a3f6fb), TOBN(0xad5cad2f, 0x85c251dd), + TOBN(0x92c1f613, 0xf5a784d3), TOBN(0xec7bfacf, 0x349766d5)}, + {TOBN(0x04b3cd33, 0x3e23cb3b), TOBN(0x3979fe84, 0xc5a64b2d), + TOBN(0x192e2720, 0x7e589106), TOBN(0xa60c43d1, 0xa15b527f)}}, + {{TOBN(0x2dae9082, 0xbe7cf3a6), TOBN(0xcc86ba92, 0xbc967274), + TOBN(0xf28a2ce8, 0xaea0a8a9), TOBN(0x404ca6d9, 0x6ee988b3)}, + {TOBN(0xfd7e9c5d, 0x005921b8), TOBN(0xf56297f1, 0x44e79bf9), + TOBN(0xa163b460, 0x0d75ddc2), TOBN(0x30b23616, 0xa1f2be87)}}, + {{TOBN(0x4b070d21, 0xbfe50e2b), TOBN(0x7ef8cfd0, 0xe1bfede1), + TOBN(0xadba0011, 0x2aac4ae0), TOBN(0x2a3e7d01, 0xb9ebd033)}, + {TOBN(0x995277ec, 0xe38d9d1c), TOBN(0xb500249e, 0x9c5d2de3), + TOBN(0x8912b820, 0xf13ca8c9), TOBN(0xc8798114, 0x877793af)}}, + {{TOBN(0x19e6125d, 0xec3f1dec), TOBN(0x07b1f040, 0x911178da), + TOBN(0xd93ededa, 0x904a6738), TOBN(0x55187a5a, 0x0bebedcd)}, + {TOBN(0xf7d04722, 0xeb329d41), TOBN(0xf449099e, 0xf170b391), + TOBN(0xfd317a69, 0xca99f828), TOBN(0x50c3db2b, 0x34a4976d)}}, + {{TOBN(0xe9ba7784, 0x3757b392), TOBN(0x326caefd, 0xaa3ca05a), + TOBN(0x78e5293b, 0xf1e593d4), TOBN(0x7842a937, 0x0d98fd13)}, + {TOBN(0xe694bf96, 0x5f96b10d), TOBN(0x373a9df6, 0x06a8cd05), + TOBN(0x997d1e51, 0xe8f0c7fc), TOBN(0x1d019790, 0x63fd972e)}}, + {{TOBN(0x0064d858, 0x5499fb32), TOBN(0x7b67bad9, 0x77a8aeb7), + TOBN(0x1d3eb977, 0x2d08eec5), TOBN(0x5fc047a6, 0xcbabae1d)}, + {TOBN(0x0577d159, 0xe54a64bb), TOBN(0x8862201b, 0xc43497e4), + TOBN(0xad6b4e28, 0x2ce0608d), TOBN(0x8b687b7d, 0x0b167aac)}}, + {{TOBN(0x6ed4d367, 0x8b2ecfa9), TOBN(0x24dfe62d, 0xa90c3c38), + TOBN(0xa1862e10, 0x3fe5c42b), TOBN(0x1ca73dca, 0xd5732a9f)}, + {TOBN(0x35f038b7, 0x76bb87ad), TOBN(0x674976ab, 0xf242b81f), + TOBN(0x4f2bde7e, 0xb0fd90cd), TOBN(0x6efc172e, 0xa7fdf092)}}, + {{TOBN(0x3806b69b, 0x92222f1f), TOBN(0x5a2459ca, 0x6cf7ae70), + TOBN(0x6789f69c, 0xa85217ee), TOBN(0x5f232b5e, 0xe3dc85ac)}, + {TOBN(0x660e3ec5, 0x48e9e516), TOBN(0x124b4e47, 0x3197eb31), + TOBN(0x10a0cb13, 0xaafcca23), TOBN(0x7bd63ba4, 0x8213224f)}}, + {{TOBN(0xaffad7cc, 0x290a7f4f), TOBN(0x6b409c9e, 0x0286b461), + TOBN(0x58ab809f, 0xffa407af), TOBN(0xc3122eed, 0xc68ac073)}, + {TOBN(0x17bf9e50, 0x4ef24d7e), TOBN(0x5d929794, 0x3e2a5811), + TOBN(0x519bc867, 0x02902e01), TOBN(0x76bba5da, 0x39c8a851)}}, + {{TOBN(0xe9f9669c, 0xda94951e), TOBN(0x4b6af58d, 0x66b8d418), + TOBN(0xfa321074, 0x17d426a4), TOBN(0xc78e66a9, 0x9dde6027)}, + {TOBN(0x0516c083, 0x4a53b964), TOBN(0xfc659d38, 0xff602330), + TOBN(0x0ab55e5c, 0x58c5c897), TOBN(0x985099b2, 0x838bc5df)}}, + {{TOBN(0x061d9efc, 0xc52fc238), TOBN(0x712b2728, 0x6ac1da3f), + TOBN(0xfb658149, 0x9283fe08), TOBN(0x4954ac94, 0xb8aaa2f7)}, + {TOBN(0x85c0ada4, 0x7fb2e74f), TOBN(0xee8ba98e, 0xb89926b0), + TOBN(0xe4f9d37d, 0x23d1af5b), TOBN(0x14ccdbf9, 0xba9b015e)}}, + {{TOBN(0xb674481b, 0x7bfe7178), TOBN(0x4e1debae, 0x65405868), + TOBN(0x061b2821, 0xc48c867d), TOBN(0x69c15b35, 0x513b30ea)}, + {TOBN(0x3b4a1666, 0x36871088), TOBN(0xe5e29f5d, 0x1220b1ff), + TOBN(0x4b82bb35, 0x233d9f4d), TOBN(0x4e076333, 0x18cdc675)}}}, + {{{TOBN(0x0d53f5c7, 0xa3e6fced), TOBN(0xe8cbbdd5, 0xf45fbdeb), + TOBN(0xf85c01df, 0x13339a70), TOBN(0x0ff71880, 0x142ceb81)}, + {TOBN(0x4c4e8774, 0xbd70437a), TOBN(0x5fb32891, 0xba0bda6a), + TOBN(0x1cdbebd2, 0xf18bd26e), TOBN(0x2f9526f1, 0x03a9d522)}}, + {{TOBN(0x40ce3051, 0x92c4d684), TOBN(0x8b04d725, 
0x7612efcd), + TOBN(0xb9dcda36, 0x6f9cae20), TOBN(0x0edc4d24, 0xf058856c)}, + {TOBN(0x64f2e6bf, 0x85427900), TOBN(0x3de81295, 0xdc09dfea), + TOBN(0xd41b4487, 0x379bf26c), TOBN(0x50b62c6d, 0x6df135a9)}}, + {{TOBN(0xd4f8e3b4, 0xc72dfe67), TOBN(0xc416b0f6, 0x90e19fdf), + TOBN(0x18b9098d, 0x4c13bd35), TOBN(0xac11118a, 0x15b8cb9e)}, + {TOBN(0xf598a318, 0xf0062841), TOBN(0xbfe0602f, 0x89f356f4), + TOBN(0x7ae3637e, 0x30177a0c), TOBN(0x34097747, 0x61136537)}}, + {{TOBN(0x0db2fb5e, 0xd005832a), TOBN(0x5f5efd3b, 0x91042e4f), + TOBN(0x8c4ffdc6, 0xed70f8ca), TOBN(0xe4645d0b, 0xb52da9cc)}, + {TOBN(0x9596f58b, 0xc9001d1f), TOBN(0x52c8f0bc, 0x4e117205), + TOBN(0xfd4aa0d2, 0xe398a084), TOBN(0x815bfe3a, 0x104f49de)}}, + {{TOBN(0x97e5443f, 0x23885e5f), TOBN(0xf72f8f99, 0xe8433aab), + TOBN(0xbd00b154, 0xe4d4e604), TOBN(0xd0b35e6a, 0xe5e173ff)}, + {TOBN(0x57b2a048, 0x9164722d), TOBN(0x3e3c665b, 0x88761ec8), + TOBN(0x6bdd1397, 0x3da83832), TOBN(0x3c8b1a1e, 0x73dafe3b)}}, + {{TOBN(0x4497ace6, 0x54317cac), TOBN(0xbe600ab9, 0x521771b3), + TOBN(0xb42e409e, 0xb0dfe8b8), TOBN(0x386a67d7, 0x3942310f)}, + {TOBN(0x25548d8d, 0x4431cc28), TOBN(0xa7cff142, 0x985dc524), + TOBN(0x4d60f5a1, 0x93c4be32), TOBN(0x83ebd5c8, 0xd071c6e1)}}, + {{TOBN(0xba3a80a7, 0xb1fd2b0b), TOBN(0x9b3ad396, 0x5bec33e8), + TOBN(0xb3868d61, 0x79743fb3), TOBN(0xcfd169fc, 0xfdb462fa)}, + {TOBN(0xd3b499d7, 0x9ce0a6af), TOBN(0x55dc1cf1, 0xe42d3ff8), + TOBN(0x04fb9e6c, 0xc6c3e1b2), TOBN(0x47e6961d, 0x6f69a474)}}, + {{TOBN(0x54eb3acc, 0xe548b37b), TOBN(0xb38e7542, 0x84d40549), + TOBN(0x8c3daa51, 0x7b341b4f), TOBN(0x2f6928ec, 0x690bf7fa)}, + {TOBN(0x0496b323, 0x86ce6c41), TOBN(0x01be1c55, 0x10adadcd), + TOBN(0xc04e67e7, 0x4bb5faf9), TOBN(0x3cbaf678, 0xe15c9985)}}, + {{TOBN(0x8cd12145, 0x50ca4247), TOBN(0xba1aa47a, 0xe7dd30aa), + TOBN(0x2f81ddf1, 0xe58fee24), TOBN(0x03452936, 0xeec9b0e8)}, + {TOBN(0x8bdc3b81, 0x243aea96), TOBN(0x9a2919af, 0x15c3d0e5), + TOBN(0x9ea640ec, 0x10948361), TOBN(0x5ac86d5b, 0x6e0bcccf)}}, + {{TOBN(0xf892d918, 0xc36cf440), TOBN(0xaed3e837, 0xc939719c), + TOBN(0xb07b08d2, 0xc0218b64), TOBN(0x6f1bcbba, 0xce9790dd)}, + {TOBN(0x4a84d6ed, 0x60919b8e), TOBN(0xd8900791, 0x8ac1f9eb), + TOBN(0xf84941aa, 0x0dd5daef), TOBN(0xb22fe40a, 0x67fd62c5)}}, + {{TOBN(0x97e15ba2, 0x157f2db3), TOBN(0xbda2fc8f, 0x8e28ca9c), + TOBN(0x5d050da4, 0x37b9f454), TOBN(0x3d57eb57, 0x2379d72e)}, + {TOBN(0xe9b5eba2, 0xfb5ee997), TOBN(0x01648ca2, 0xe11538ca), + TOBN(0x32bb76f6, 0xf6327974), TOBN(0x338f14b8, 0xff3f4bb7)}}, + {{TOBN(0x524d226a, 0xd7ab9a2d), TOBN(0x9c00090d, 0x7dfae958), + TOBN(0x0ba5f539, 0x8751d8c2), TOBN(0x8afcbcdd, 0x3ab8262d)}, + {TOBN(0x57392729, 0xe99d043b), TOBN(0xef51263b, 0xaebc943a), + TOBN(0x9feace93, 0x20862935), TOBN(0x639efc03, 0xb06c817b)}}, + {{TOBN(0x1fe054b3, 0x66b4be7a), TOBN(0x3f25a9de, 0x84a37a1e), + TOBN(0xf39ef1ad, 0x78d75cd9), TOBN(0xd7b58f49, 0x5062c1b5)}, + {TOBN(0x6f74f9a9, 0xff563436), TOBN(0xf718ff29, 0xe8af51e7), + TOBN(0x5234d313, 0x15e97fec), TOBN(0xb6a8e2b1, 0x292f1c0a)}}, + {{TOBN(0xa7f53aa8, 0x327720c1), TOBN(0x956ca322, 0xba092cc8), + TOBN(0x8f03d64a, 0x28746c4d), TOBN(0x51fe1782, 0x66d0d392)}, + {TOBN(0xd19b34db, 0x3c832c80), TOBN(0x60dccc5c, 0x6da2e3b4), + TOBN(0x245dd62e, 0x0a104ccc), TOBN(0xa7ab1de1, 0x620b21fd)}}, + {{TOBN(0xb293ae0b, 0x3893d123), TOBN(0xf7b75783, 0xb15ee71c), + TOBN(0x5aa3c614, 0x42a9468b), TOBN(0xd686123c, 0xdb15d744)}, + {TOBN(0x8c616891, 0xa7ab4116), TOBN(0x6fcd72c8, 0xa4e6a459), + TOBN(0xac219110, 0x77e5fad7), TOBN(0xfb6a20e7, 0x704fa46b)}}, + {{TOBN(0xe839be7d, 0x341d81dc), 
TOBN(0xcddb6889, 0x32148379), + TOBN(0xda6211a1, 0xf7026ead), TOBN(0xf3b2575f, 0xf4d1cc5e)}, + {TOBN(0x40cfc8f6, 0xa7a73ae6), TOBN(0x83879a5e, 0x61d5b483), + TOBN(0xc5acb1ed, 0x41a50ebc), TOBN(0x59a60cc8, 0x3c07d8fa)}}, + {{TOBN(0x1b73bdce, 0xb1876262), TOBN(0x2b0d79f0, 0x12af4ee9), + TOBN(0x8bcf3b0b, 0xd46e1d07), TOBN(0x17d6af9d, 0xe45d152f)}, + {TOBN(0x73520461, 0x6d736451), TOBN(0x43cbbd97, 0x56b0bf5a), + TOBN(0xb0833a5b, 0xd5999b9d), TOBN(0x702614f0, 0xeb72e398)}}, + {{TOBN(0x0aadf01a, 0x59c3e9f8), TOBN(0x40200e77, 0xce6b3d16), + TOBN(0xda22bdd3, 0xdeddafad), TOBN(0x76dedaf4, 0x310d72e1)}, + {TOBN(0x49ef807c, 0x4bc2e88f), TOBN(0x6ba81291, 0x146dd5a5), + TOBN(0xa1a4077a, 0x7d8d59e9), TOBN(0x87b6a2e7, 0x802db349)}}, + {{TOBN(0xd5679997, 0x1b4e598e), TOBN(0xf499ef1f, 0x06fe4b1d), + TOBN(0x3978d3ae, 0xfcb267c5), TOBN(0xb582b557, 0x235786d0)}, + {TOBN(0x32b3b2ca, 0x1715cb07), TOBN(0x4c3de6a2, 0x8480241d), + TOBN(0x63b5ffed, 0xcb571ecd), TOBN(0xeaf53900, 0xed2fe9a9)}}, + {{TOBN(0xdec98d4a, 0xc3b81990), TOBN(0x1cb83722, 0x9e0cc8fe), + TOBN(0xfe0b0491, 0xd2b427b9), TOBN(0x0f2386ac, 0xe983a66c)}, + {TOBN(0x930c4d1e, 0xb3291213), TOBN(0xa2f82b2e, 0x59a62ae4), + TOBN(0x77233853, 0xf93e89e3), TOBN(0x7f8063ac, 0x11777c7f)}}, + {{TOBN(0xff0eb567, 0x59ad2877), TOBN(0x6f454642, 0x9865c754), + TOBN(0xe6fe701a, 0x236e9a84), TOBN(0xc586ef16, 0x06e40fc3)}, + {TOBN(0x3f62b6e0, 0x24bafad9), TOBN(0xc8b42bd2, 0x64da906a), + TOBN(0xc98e1eb4, 0xda3276a0), TOBN(0x30d0e5fc, 0x06cbf852)}}, + {{TOBN(0x1b6b2ae1, 0xe8b4dfd4), TOBN(0xd754d5c7, 0x8301cbac), + TOBN(0x66097629, 0x112a39ac), TOBN(0xf86b5999, 0x93ba4ab9)}, + {TOBN(0x26c9dea7, 0x99f9d581), TOBN(0x0473b1a8, 0xc2fafeaa), + TOBN(0x1469af55, 0x3b2505a5), TOBN(0x227d16d7, 0xd6a43323)}}, + {{TOBN(0x3316f73c, 0xad3d97f9), TOBN(0x52bf3bb5, 0x1f137455), + TOBN(0x953eafeb, 0x09954e7c), TOBN(0xa721dfed, 0xdd732411)}, + {TOBN(0xb4929821, 0x141d4579), TOBN(0x3411321c, 0xaa3bd435), + TOBN(0xafb355aa, 0x17fa6015), TOBN(0xb4e7ef4a, 0x18e42f0e)}}, + {{TOBN(0x604ac97c, 0x59371000), TOBN(0xe1c48c70, 0x7f759c18), + TOBN(0x3f62ecc5, 0xa5db6b65), TOBN(0x0a78b173, 0x38a21495)}, + {TOBN(0x6be1819d, 0xbcc8ad94), TOBN(0x70dc04f6, 0xd89c3400), + TOBN(0x462557b4, 0xa6b4840a), TOBN(0x544c6ade, 0x60bd21c0)}}, + {{TOBN(0x6a00f24e, 0x907a544b), TOBN(0xa7520dcb, 0x313da210), + TOBN(0xfe939b75, 0x11e4994b), TOBN(0x918b6ba6, 0xbc275d70)}, + {TOBN(0xd3e5e0fc, 0x644be892), TOBN(0x707a9816, 0xfdaf6c42), + TOBN(0x60145567, 0xf15c13fe), TOBN(0x4818ebaa, 0xe130a54a)}}, + {{TOBN(0x28aad3ad, 0x58d2f767), TOBN(0xdc5267fd, 0xd7e7c773), + TOBN(0x4919cc88, 0xc3afcc98), TOBN(0xaa2e6ab0, 0x2db8cd4b)}, + {TOBN(0xd46fec04, 0xd0c63eaa), TOBN(0xa1cb92c5, 0x19ffa832), + TOBN(0x678dd178, 0xe43a631f), TOBN(0xfb5ae1cd, 0x3dc788b3)}}, + {{TOBN(0x68b4fb90, 0x6e77de04), TOBN(0x7992bcf0, 0xf06dbb97), + TOBN(0x896e6a13, 0xc417c01d), TOBN(0x8d96332c, 0xb956be01)}, + {TOBN(0x902fc93a, 0x413aa2b9), TOBN(0x99a4d915, 0xfc98c8a5), + TOBN(0x52c29407, 0x565f1137), TOBN(0x4072690f, 0x21e4f281)}}, + {{TOBN(0x36e607cf, 0x02ff6072), TOBN(0xa47d2ca9, 0x8ad98cdc), + TOBN(0xbf471d1e, 0xf5f56609), TOBN(0xbcf86623, 0xf264ada0)}, + {TOBN(0xb70c0687, 0xaa9e5cb6), TOBN(0xc98124f2, 0x17401c6c), + TOBN(0x8189635f, 0xd4a61435), TOBN(0xd28fb8af, 0xa9d98ea6)}}, + {{TOBN(0xb9a67c2a, 0x40c251f8), TOBN(0x88cd5d87, 0xa2da44be), + TOBN(0x437deb96, 0xe09b5423), TOBN(0x150467db, 0x64287dc1)}, + {TOBN(0xe161debb, 0xcdabb839), TOBN(0xa79e9742, 0xf1839a3e), + TOBN(0xbb8dd3c2, 0x652d202b), TOBN(0x7b3e67f7, 0xe9f97d96)}}, + {{TOBN(0x5aa5d78f, 
0xb1cb6ac9), TOBN(0xffa13e8e, 0xca1d0d45), + TOBN(0x369295dd, 0x2ba5bf95), TOBN(0xd68bd1f8, 0x39aff05e)}, + {TOBN(0xaf0d86f9, 0x26d783f2), TOBN(0x543a59b3, 0xfc3aafc1), + TOBN(0x3fcf81d2, 0x7b7da97c), TOBN(0xc990a056, 0xd25dee46)}}, + {{TOBN(0x3e6775b8, 0x519cce2c), TOBN(0xfc9af71f, 0xae13d863), + TOBN(0x774a4a6f, 0x47c1605c), TOBN(0x46ba4245, 0x2fd205e8)}, + {TOBN(0xa06feea4, 0xd3fd524d), TOBN(0x1e724641, 0x6de1acc2), + TOBN(0xf53816f1, 0x334e2b42), TOBN(0x49e5918e, 0x922f0024)}}, + {{TOBN(0x439530b6, 0x65c7322d), TOBN(0xcf12cc01, 0xb3c1b3fb), + TOBN(0xc70b0186, 0x0172f685), TOBN(0xb915ee22, 0x1b58391d)}, + {TOBN(0x9afdf03b, 0xa317db24), TOBN(0x87dec659, 0x17b8ffc4), + TOBN(0x7f46597b, 0xe4d3d050), TOBN(0x80a1c1ed, 0x006500e7)}}, + {{TOBN(0x84902a96, 0x78bf030e), TOBN(0xfb5e9c9a, 0x50560148), + TOBN(0x6dae0a92, 0x63362426), TOBN(0xdcaeecf4, 0xa9e30c40)}, + {TOBN(0xc0d887bb, 0x518d0c6b), TOBN(0x99181152, 0xcb985b9d), + TOBN(0xad186898, 0xef7bc381), TOBN(0x18168ffb, 0x9ee46201)}}, + {{TOBN(0x9a04cdaa, 0x2502753c), TOBN(0xbb279e26, 0x51407c41), + TOBN(0xeacb03aa, 0xf23564e5), TOBN(0x18336582, 0x71e61016)}, + {TOBN(0x8684b8c4, 0xeb809877), TOBN(0xb336e18d, 0xea0e672e), + TOBN(0xefb601f0, 0x34ee5867), TOBN(0x2733edbe, 0x1341cfd1)}}, + {{TOBN(0xb15e809a, 0x26025c3c), TOBN(0xe6e981a6, 0x9350df88), + TOBN(0x92376237, 0x8502fd8e), TOBN(0x4791f216, 0x0c12be9b)}, + {TOBN(0xb7256789, 0x25f02425), TOBN(0xec863194, 0x7a974443), + TOBN(0x7c0ce882, 0xfb41cc52), TOBN(0xc266ff7e, 0xf25c07f2)}}, + {{TOBN(0x3d4da8c3, 0x017025f3), TOBN(0xefcf628c, 0xfb9579b4), + TOBN(0x5c4d0016, 0x1f3716ec), TOBN(0x9c27ebc4, 0x6801116e)}, + {TOBN(0x5eba0ea1, 0x1da1767e), TOBN(0xfe151452, 0x47004c57), + TOBN(0x3ace6df6, 0x8c2373b7), TOBN(0x75c3dffe, 0x5dbc37ac)}}, + {{TOBN(0x3dc32a73, 0xddc925fc), TOBN(0xb679c841, 0x2f65ee0b), + TOBN(0x715a3295, 0x451cbfeb), TOBN(0xd9889768, 0xf76e9a29)}, + {TOBN(0xec20ce7f, 0xb28ad247), TOBN(0xe99146c4, 0x00894d79), + TOBN(0x71457d7c, 0x9f5e3ea7), TOBN(0x097b2662, 0x38030031)}}, + {{TOBN(0xdb7f6ae6, 0xcf9f82a8), TOBN(0x319decb9, 0x438f473a), + TOBN(0xa63ab386, 0x283856c3), TOBN(0x13e3172f, 0xb06a361b)}, + {TOBN(0x2959f8dc, 0x7d5a006c), TOBN(0x2dbc27c6, 0x75fba752), + TOBN(0xc1227ab2, 0x87c22c9e), TOBN(0x06f61f75, 0x71a268b2)}}, + {{TOBN(0x1b6bb971, 0x04779ce2), TOBN(0xaca83812, 0x0aadcb1d), + TOBN(0x297ae0bc, 0xaeaab2d5), TOBN(0xa5c14ee7, 0x5bfb9f13)}, + {TOBN(0xaa00c583, 0xf17a62c7), TOBN(0x39eb962c, 0x173759f6), + TOBN(0x1eeba1d4, 0x86c9a88f), TOBN(0x0ab6c37a, 0xdf016c5e)}}, + {{TOBN(0xa2a147db, 0xa28a0749), TOBN(0x246c20d6, 0xee519165), + TOBN(0x5068d1b1, 0xd3810715), TOBN(0xb1e7018c, 0x748160b9)}, + {TOBN(0x03f5b1fa, 0xf380ff62), TOBN(0xef7fb1dd, 0xf3cb2c1e), + TOBN(0xeab539a8, 0xfc91a7da), TOBN(0x83ddb707, 0xf3f9b561)}}, + {{TOBN(0xc550e211, 0xfe7df7a4), TOBN(0xa7cd07f2, 0x063f6f40), + TOBN(0xb0de3635, 0x2976879c), TOBN(0xb5f83f85, 0xe55741da)}, + {TOBN(0x4ea9d25e, 0xf3d8ac3d), TOBN(0x6fe2066f, 0x62819f02), + TOBN(0x4ab2b9c2, 0xcef4a564), TOBN(0x1e155d96, 0x5ffa2de3)}}, + {{TOBN(0x0eb0a19b, 0xc3a72d00), TOBN(0x4037665b, 0x8513c31b), + TOBN(0x2fb2b6bf, 0x04c64637), TOBN(0x45c34d6e, 0x08cdc639)}, + {TOBN(0x56f1e10f, 0xf01fd796), TOBN(0x4dfb8101, 0xfe3667b8), + TOBN(0xe0eda253, 0x9021d0c0), TOBN(0x7a94e9ff, 0x8a06c6ab)}}, + {{TOBN(0x2d3bb0d9, 0xbb9aa882), TOBN(0xea20e4e5, 0xec05fd10), + TOBN(0xed7eeb5f, 0x1a1ca64e), TOBN(0x2fa6b43c, 0xc6327cbd)}, + {TOBN(0xb577e3cf, 0x3aa91121), TOBN(0x8c6bd5ea, 0x3a34079b), + TOBN(0xd7e5ba39, 0x60e02fc0), TOBN(0xf16dd2c3, 0x90141bf8)}}, + 
{{TOBN(0xb57276d9, 0x80101b98), TOBN(0x760883fd, 0xb82f0f66), + TOBN(0x89d7de75, 0x4bc3eff3), TOBN(0x03b60643, 0x5dc2ab40)}, + {TOBN(0xcd6e53df, 0xe05beeac), TOBN(0xf2f1e862, 0xbc3325cd), + TOBN(0xdd0f7921, 0x774f03c3), TOBN(0x97ca7221, 0x4552cc1b)}}, + {{TOBN(0x5a0d6afe, 0x1cd19f72), TOBN(0xa20915dc, 0xf183fbeb), + TOBN(0x9fda4b40, 0x832c403c), TOBN(0x32738edd, 0xbe425442)}, + {TOBN(0x469a1df6, 0xb5eccf1a), TOBN(0x4b5aff42, 0x28bbe1f0), + TOBN(0x31359d7f, 0x570dfc93), TOBN(0xa18be235, 0xf0088628)}}, + {{TOBN(0xa5b30fba, 0xb00ed3a9), TOBN(0x34c61374, 0x73cdf8be), + TOBN(0x2c5c5f46, 0xabc56797), TOBN(0x5cecf93d, 0xb82a8ae2)}, + {TOBN(0x7d3dbe41, 0xa968fbf0), TOBN(0xd23d4583, 0x1a5c7f3d), + TOBN(0xf28f69a0, 0xc087a9c7), TOBN(0xc2d75471, 0x474471ca)}}, + {{TOBN(0x36ec9f4a, 0x4eb732ec), TOBN(0x6c943bbd, 0xb1ca6bed), + TOBN(0xd64535e1, 0xf2457892), TOBN(0x8b84a8ea, 0xf7e2ac06)}, + {TOBN(0xe0936cd3, 0x2499dd5f), TOBN(0x12053d7e, 0x0ed04e57), + TOBN(0x4bdd0076, 0xe4305d9d), TOBN(0x34a527b9, 0x1f67f0a2)}}, + {{TOBN(0xe79a4af0, 0x9cec46ea), TOBN(0xb15347a1, 0x658b9bc7), + TOBN(0x6bd2796f, 0x35af2f75), TOBN(0xac957990, 0x4051c435)}, + {TOBN(0x2669dda3, 0xc33a655d), TOBN(0x5d503c2e, 0x88514aa3), + TOBN(0xdfa11337, 0x3753dd41), TOBN(0x3f054673, 0x0b754f78)}}, + {{TOBN(0xbf185677, 0x496125bd), TOBN(0xfb0023c8, 0x3775006c), + TOBN(0xfa0f072f, 0x3a037899), TOBN(0x4222b6eb, 0x0e4aea57)}, + {TOBN(0x3dde5e76, 0x7866d25a), TOBN(0xb6eb04f8, 0x4837aa6f), + TOBN(0x5315591a, 0x2cf1cdb8), TOBN(0x6dfb4f41, 0x2d4e683c)}}, + {{TOBN(0x7e923ea4, 0x48ee1f3a), TOBN(0x9604d9f7, 0x05a2afd5), + TOBN(0xbe1d4a33, 0x40ea4948), TOBN(0x5b45f1f4, 0xb44cbd2f)}, + {TOBN(0x5faf8376, 0x4acc757e), TOBN(0xa7cf9ab8, 0x63d68ff7), + TOBN(0x8ad62f69, 0xdf0e404b), TOBN(0xd65f33c2, 0x12bdafdf)}}, + {{TOBN(0xc365de15, 0xa377b14e), TOBN(0x6bf5463b, 0x8e39f60c), + TOBN(0x62030d2d, 0x2ce68148), TOBN(0xd95867ef, 0xe6f843a8)}, + {TOBN(0xd39a0244, 0xef5ab017), TOBN(0x0bd2d8c1, 0x4ab55d12), + TOBN(0xc9503db3, 0x41639169), TOBN(0x2d4e25b0, 0xf7660c8a)}}, + {{TOBN(0x760cb3b5, 0xe224c5d7), TOBN(0xfa3baf8c, 0x68616919), + TOBN(0x9fbca113, 0x8d142552), TOBN(0x1ab18bf1, 0x7669ebf5)}, + {TOBN(0x55e6f53e, 0x9bdf25dd), TOBN(0x04cc0bf3, 0xcb6cd154), + TOBN(0x595bef49, 0x95e89080), TOBN(0xfe9459a8, 0x104a9ac1)}}, + {{TOBN(0xad2d89ca, 0xcce9bb32), TOBN(0xddea65e1, 0xf7de8285), + TOBN(0x62ed8c35, 0xb351bd4b), TOBN(0x4150ff36, 0x0c0e19a7)}, + {TOBN(0x86e3c801, 0x345f4e47), TOBN(0x3bf21f71, 0x203a266c), + TOBN(0x7ae110d4, 0x855b1f13), TOBN(0x5d6aaf6a, 0x07262517)}}, + {{TOBN(0x1e0f12e1, 0x813d28f1), TOBN(0x6000e11d, 0x7ad7a523), + TOBN(0xc7d8deef, 0xc744a17b), TOBN(0x1e990b48, 0x14c05a00)}, + {TOBN(0x68fddaee, 0x93e976d5), TOBN(0x696241d1, 0x46610d63), + TOBN(0xb204e7c3, 0x893dda88), TOBN(0x8bccfa65, 0x6a3a6946)}}, + {{TOBN(0xb59425b4, 0xc5cd1411), TOBN(0x701b4042, 0xff3658b1), + TOBN(0xe3e56bca, 0x4784cf93), TOBN(0x27de5f15, 0x8fe68d60)}, + {TOBN(0x4ab9cfce, 0xf8d53f19), TOBN(0xddb10311, 0xa40a730d), + TOBN(0x6fa73cd1, 0x4eee0a8a), TOBN(0xfd548748, 0x5249719d)}}, + {{TOBN(0x49d66316, 0xa8123ef0), TOBN(0x73c32db4, 0xe7f95438), + TOBN(0x2e2ed209, 0x0d9e7854), TOBN(0xf98a9329, 0x9d9f0507)}, + {TOBN(0xc5d33cf6, 0x0c6aa20a), TOBN(0x9a32ba14, 0x75279bb2), + TOBN(0x7e3202cb, 0x774a7307), TOBN(0x64ed4bc4, 0xe8c42dbd)}}, + {{TOBN(0xc20f1a06, 0xd4caed0d), TOBN(0xb8021407, 0x171d22b3), + TOBN(0xd426ca04, 0xd13268d7), TOBN(0x92377007, 0x25f4d126)}, + {TOBN(0x4204cbc3, 0x71f21a85), TOBN(0x18461b7a, 0xf82369ba), + TOBN(0xc0c07d31, 0x3fc858f9), TOBN(0x5deb5a50, 0xe2bab569)}}, 
+ {{TOBN(0xd5959d46, 0xd5eea89e), TOBN(0xfdff8424, 0x08437f4b), + TOBN(0xf21071e4, 0x3cfe254f), TOBN(0x72417696, 0x95468321)}, + {TOBN(0x5d8288b9, 0x102cae3e), TOBN(0x2d143e3d, 0xf1965dff), + TOBN(0x00c9a376, 0xa078d847), TOBN(0x6fc0da31, 0x26028731)}}, + {{TOBN(0xa2baeadf, 0xe45083a2), TOBN(0x66bc7218, 0x5e5b4bcd), + TOBN(0x2c826442, 0xd04b8e7f), TOBN(0xc19f5451, 0x6c4b586b)}, + {TOBN(0x60182c49, 0x5b7eeed5), TOBN(0xd9954ecd, 0x7aa9dfa1), + TOBN(0xa403a8ec, 0xc73884ad), TOBN(0x7fb17de2, 0x9bb39041)}}, + {{TOBN(0x694b64c5, 0xabb020e8), TOBN(0x3d18c184, 0x19c4eec7), + TOBN(0x9c4673ef, 0x1c4793e5), TOBN(0xc7b8aeb5, 0x056092e6)}, + {TOBN(0x3aa1ca43, 0xf0f8c16b), TOBN(0x224ed5ec, 0xd679b2f6), + TOBN(0x0d56eeaf, 0x55a205c9), TOBN(0xbfe115ba, 0x4b8e028b)}}, + {{TOBN(0x97e60849, 0x3927f4fe), TOBN(0xf91fbf94, 0x759aa7c5), + TOBN(0x985af769, 0x6be90a51), TOBN(0xc1277b78, 0x78ccb823)}, + {TOBN(0x395b656e, 0xe7a75952), TOBN(0x00df7de0, 0x928da5f5), + TOBN(0x09c23175, 0x4ca4454f), TOBN(0x4ec971f4, 0x7aa2d3c1)}}, + {{TOBN(0x45c3c507, 0xe75d9ccc), TOBN(0x63b7be8a, 0x3dc90306), + TOBN(0x37e09c66, 0x5db44bdc), TOBN(0x50d60da1, 0x6841c6a2)}, + {TOBN(0x6f9b65ee, 0x08df1b12), TOBN(0x38734879, 0x7ff089df), + TOBN(0x9c331a66, 0x3fe8013d), TOBN(0x017f5de9, 0x5f42fcc8)}}, + {{TOBN(0x43077866, 0xe8e57567), TOBN(0xc9f781ce, 0xf9fcdb18), + TOBN(0x38131dda, 0x9b12e174), TOBN(0x25d84aa3, 0x8a03752a)}, + {TOBN(0x45e09e09, 0x4d0c0ce2), TOBN(0x1564008b, 0x92bebba5), + TOBN(0xf7e8ad31, 0xa87284c7), TOBN(0xb7c4b46c, 0x97e7bbaa)}}, + {{TOBN(0x3e22a7b3, 0x97acf4ec), TOBN(0x0426c400, 0x5ea8b640), + TOBN(0x5e3295a6, 0x4e969285), TOBN(0x22aabc59, 0xa6a45670)}, + {TOBN(0xb929714c, 0x5f5942bc), TOBN(0x9a6168bd, 0xfa3182ed), + TOBN(0x2216a665, 0x104152ba), TOBN(0x46908d03, 0xb6926368)}}}, + {{{TOBN(0xa9f5d874, 0x5a1251fb), TOBN(0x967747a8, 0xc72725c7), + TOBN(0x195c33e5, 0x31ffe89e), TOBN(0x609d210f, 0xe964935e)}, + {TOBN(0xcafd6ca8, 0x2fe12227), TOBN(0xaf9b5b96, 0x0426469d), + TOBN(0x2e9ee04c, 0x5693183c), TOBN(0x1084a333, 0xc8146fef)}}, + {{TOBN(0x96649933, 0xaed1d1f7), TOBN(0x566eaff3, 0x50563090), + TOBN(0x345057f0, 0xad2e39cf), TOBN(0x148ff65b, 0x1f832124)}, + {TOBN(0x042e89d4, 0xcf94cf0d), TOBN(0x319bec84, 0x520c58b3), + TOBN(0x2a267626, 0x5361aa0d), TOBN(0xc86fa302, 0x8fbc87ad)}}, + {{TOBN(0xfc83d2ab, 0x5c8b06d5), TOBN(0xb1a785a2, 0xfe4eac46), + TOBN(0xb99315bc, 0x846f7779), TOBN(0xcf31d816, 0xef9ea505)}, + {TOBN(0x2391fe6a, 0x15d7dc85), TOBN(0x2f132b04, 0xb4016b33), + TOBN(0x29547fe3, 0x181cb4c7), TOBN(0xdb66d8a6, 0x650155a1)}}, + {{TOBN(0x6b66d7e1, 0xadc1696f), TOBN(0x98ebe593, 0x0acd72d0), + TOBN(0x65f24550, 0xcc1b7435), TOBN(0xce231393, 0xb4b9a5ec)}, + {TOBN(0x234a22d4, 0xdb067df9), TOBN(0x98dda095, 0xcaff9b00), + TOBN(0x1bbc75a0, 0x6100c9c1), TOBN(0x1560a9c8, 0x939cf695)}}, + {{TOBN(0xcf006d3e, 0x99e0925f), TOBN(0x2dd74a96, 0x6322375a), + TOBN(0xc58b446a, 0xb56af5ba), TOBN(0x50292683, 0xe0b9b4f1)}, + {TOBN(0xe2c34cb4, 0x1aeaffa3), TOBN(0x8b17203f, 0x9b9587c1), + TOBN(0x6d559207, 0xead1350c), TOBN(0x2b66a215, 0xfb7f9604)}}, + {{TOBN(0x0850325e, 0xfe51bf74), TOBN(0x9c4f579e, 0x5e460094), + TOBN(0x5c87b92a, 0x76da2f25), TOBN(0x889de4e0, 0x6febef33)}, + {TOBN(0x6900ec06, 0x646083ce), TOBN(0xbe2a0335, 0xbfe12773), + TOBN(0xadd1da35, 0xc5344110), TOBN(0x757568b7, 0xb802cd20)}}, + {{TOBN(0x75559779, 0x00f7e6c8), TOBN(0x38e8b94f, 0x0facd2f0), + TOBN(0xfea1f3af, 0x03fde375), TOBN(0x5e11a1d8, 0x75881dfc)}, + {TOBN(0xb3a6b02e, 0xc1e2f2ef), TOBN(0x193d2bbb, 0xc605a6c5), + TOBN(0x325ffeee, 0x339a0b2d), TOBN(0x27b6a724, 
0x9e0c8846)}}, + {{TOBN(0xe4050f1c, 0xf1c367ca), TOBN(0x9bc85a9b, 0xc90fbc7d), + TOBN(0xa373c4a2, 0xe1a11032), TOBN(0xb64232b7, 0xad0393a9)}, + {TOBN(0xf5577eb0, 0x167dad29), TOBN(0x1604f301, 0x94b78ab2), + TOBN(0x0baa94af, 0xe829348b), TOBN(0x77fbd8dd, 0x41654342)}}, + {{TOBN(0xdab50ea5, 0xb964e39a), TOBN(0xd4c29e3c, 0xd0d3c76e), + TOBN(0x80dae67c, 0x56d11964), TOBN(0x7307a8bf, 0xe5ffcc2f)}, + {TOBN(0x65bbc1aa, 0x91708c3b), TOBN(0xa151e62c, 0x28bf0eeb), + TOBN(0x6cb53381, 0x6fa34db7), TOBN(0x5139e05c, 0xa29403a8)}}, + {{TOBN(0x6ff651b4, 0x94a7cd2e), TOBN(0x5671ffd1, 0x0699336c), + TOBN(0x6f5fd2cc, 0x979a896a), TOBN(0x11e893a8, 0xd8148cef)}, + {TOBN(0x988906a1, 0x65cf7b10), TOBN(0x81b67178, 0xc50d8485), + TOBN(0x7c0deb35, 0x8a35b3de), TOBN(0x423ac855, 0xc1d29799)}}, + {{TOBN(0xaf580d87, 0xdac50b74), TOBN(0x28b2b89f, 0x5869734c), + TOBN(0x99a3b936, 0x874e28fb), TOBN(0xbb2c9190, 0x25f3f73a)}, + {TOBN(0x199f6918, 0x84a9d5b7), TOBN(0x7ebe2325, 0x7e770374), + TOBN(0xf442e107, 0x0738efe2), TOBN(0xcf9f3f56, 0xcf9082d2)}}, + {{TOBN(0x719f69e1, 0x09618708), TOBN(0xcc9e8364, 0xc183f9b1), + TOBN(0xec203a95, 0x366a21af), TOBN(0x6aec5d6d, 0x068b141f)}, + {TOBN(0xee2df78a, 0x994f04e9), TOBN(0xb39ccae8, 0x271245b0), + TOBN(0xb875a4a9, 0x97e43f4f), TOBN(0x507dfe11, 0xdb2cea98)}}, + {{TOBN(0x4fbf81cb, 0x489b03e9), TOBN(0xdb86ec5b, 0x6ec414fa), + TOBN(0xfad444f9, 0xf51b3ae5), TOBN(0xca7d33d6, 0x1914e3fe)}, + {TOBN(0xa9c32f5c, 0x0ae6c4d0), TOBN(0xa9ca1d1e, 0x73969568), + TOBN(0x98043c31, 0x1aa7467e), TOBN(0xe832e75c, 0xe21b5ac6)}}, + {{TOBN(0x314b7aea, 0x5232123d), TOBN(0x08307c8c, 0x65ae86db), + TOBN(0x06e7165c, 0xaa4668ed), TOBN(0xb170458b, 0xb4d3ec39)}, + {TOBN(0x4d2e3ec6, 0xc19bb986), TOBN(0xc5f34846, 0xae0304ed), + TOBN(0x917695a0, 0x6c9f9722), TOBN(0x6c7f7317, 0x4cab1c0a)}}, + {{TOBN(0x6295940e, 0x9d6d2e8b), TOBN(0xd318b8c1, 0x549f7c97), + TOBN(0x22453204, 0x97713885), TOBN(0x468d834b, 0xa8a440fe)}, + {TOBN(0xd81fe5b2, 0xbfba796e), TOBN(0x152364db, 0x6d71f116), + TOBN(0xbb8c7c59, 0xb5b66e53), TOBN(0x0b12c61b, 0x2641a192)}}, + {{TOBN(0x31f14802, 0xfcf0a7fd), TOBN(0x42fd0789, 0x5488b01e), + TOBN(0x71d78d6d, 0x9952b498), TOBN(0x8eb572d9, 0x07ac5201)}, + {TOBN(0xe0a2a44c, 0x4d194a88), TOBN(0xd2b63fd9, 0xba017e66), + TOBN(0x78efc6c8, 0xf888aefc), TOBN(0xb76f6bda, 0x4a881a11)}}, + {{TOBN(0x187f314b, 0xb46c2397), TOBN(0x004cf566, 0x5ded2819), + TOBN(0xa9ea5704, 0x38764d34), TOBN(0xbba45217, 0x78084709)}, + {TOBN(0x06474571, 0x1171121e), TOBN(0xad7b7eb1, 0xe7c9b671), + TOBN(0xdacfbc40, 0x730f7507), TOBN(0x178cd8c6, 0xc7ad7bd1)}}, + {{TOBN(0xbf0be101, 0xb2a67238), TOBN(0x3556d367, 0xaf9c14f2), + TOBN(0x104b7831, 0xa5662075), TOBN(0x58ca59bb, 0x79d9e60a)}, + {TOBN(0x4bc45392, 0xa569a73b), TOBN(0x517a52e8, 0x5698f6c9), + TOBN(0x85643da5, 0xaeadd755), TOBN(0x1aed0cd5, 0x2a581b84)}}, + {{TOBN(0xb9b4ff84, 0x80af1372), TOBN(0x244c3113, 0xf1ba5d1f), + TOBN(0x2a5dacbe, 0xf5f98d31), TOBN(0x2c3323e8, 0x4375bc2a)}, + {TOBN(0x17a3ab4a, 0x5594b1dd), TOBN(0xa1928bfb, 0xceb4797e), + TOBN(0xe83af245, 0xe4886a19), TOBN(0x8979d546, 0x72b5a74a)}}, + {{TOBN(0xa0f726bc, 0x19f9e967), TOBN(0xd9d03152, 0xe8fbbf4e), + TOBN(0xcfd6f51d, 0xb7707d40), TOBN(0x633084d9, 0x63f6e6e0)}, + {TOBN(0xedcd9cdc, 0x55667eaf), TOBN(0x73b7f92b, 0x2e44d56f), + TOBN(0xfb2e39b6, 0x4e962b14), TOBN(0x7d408f6e, 0xf671fcbf)}}, + {{TOBN(0xcc634ddc, 0x164a89bb), TOBN(0x74a42bb2, 0x3ef3bd05), + TOBN(0x1280dbb2, 0x428decbb), TOBN(0x6103f6bb, 0x402c8596)}, + {TOBN(0xfa2bf581, 0x355a5752), TOBN(0x562f96a8, 0x00946674), + TOBN(0x4e4ca16d, 0x6da0223b), 
TOBN(0xfe47819f, 0x28d3aa25)}}, + {{TOBN(0x9eea3075, 0xf8dfcf8a), TOBN(0xa284f0aa, 0x95669825), + TOBN(0xb3fca250, 0x867d3fd8), TOBN(0x20757b5f, 0x269d691e)}, + {TOBN(0xf2c24020, 0x93b8a5de), TOBN(0xd3f93359, 0xebc06da6), + TOBN(0x1178293e, 0xb2739c33), TOBN(0xd2a3e770, 0xbcd686e5)}}, + {{TOBN(0xa76f49f4, 0xcd941534), TOBN(0x0d37406b, 0xe3c71c0e), + TOBN(0x172d9397, 0x3b97f7e3), TOBN(0xec17e239, 0xbd7fd0de)}, + {TOBN(0xe3290551, 0x6f496ba2), TOBN(0x6a693172, 0x36ad50e7), + TOBN(0xc4e539a2, 0x83e7eff5), TOBN(0x752737e7, 0x18e1b4cf)}}, + {{TOBN(0xa2f7932c, 0x68af43ee), TOBN(0x5502468e, 0x703d00bd), + TOBN(0xe5dc978f, 0x2fb061f5), TOBN(0xc9a1904a, 0x28c815ad)}, + {TOBN(0xd3af538d, 0x470c56a4), TOBN(0x159abc5f, 0x193d8ced), + TOBN(0x2a37245f, 0x20108ef3), TOBN(0xfa17081e, 0x223f7178)}}, + {{TOBN(0x27b0fb2b, 0x10c8c0f5), TOBN(0x2102c3ea, 0x40650547), + TOBN(0x594564df, 0x8ac3bfa7), TOBN(0x98102033, 0x509dad96)}, + {TOBN(0x6989643f, 0xf1d18a13), TOBN(0x35eebd91, 0xd7fc5af0), + TOBN(0x078d096a, 0xfaeaafd8), TOBN(0xb7a89341, 0xdef3de98)}}, + {{TOBN(0x2a206e8d, 0xecf2a73a), TOBN(0x066a6397, 0x8e551994), + TOBN(0x3a6a088a, 0xb98d53a2), TOBN(0x0ce7c67c, 0x2d1124aa)}, + {TOBN(0x48cec671, 0x759a113c), TOBN(0xe3b373d3, 0x4f6f67fa), + TOBN(0x5455d479, 0xfd36727b), TOBN(0xe5a428ee, 0xa13c0d81)}}, + {{TOBN(0xb853dbc8, 0x1c86682b), TOBN(0xb78d2727, 0xb8d02b2a), + TOBN(0xaaf69bed, 0x8ebc329a), TOBN(0xdb6b40b3, 0x293b2148)}, + {TOBN(0xe42ea77d, 0xb8c4961f), TOBN(0xb1a12f7c, 0x20e5e0ab), + TOBN(0xa0ec5274, 0x79e8b05e), TOBN(0x68027391, 0xfab60a80)}}, + {{TOBN(0x6bfeea5f, 0x16b1bd5e), TOBN(0xf957e420, 0x4de30ad3), + TOBN(0xcbaf664e, 0x6a353b9e), TOBN(0x5c873312, 0x26d14feb)}, + {TOBN(0x4e87f98c, 0xb65f57cb), TOBN(0xdb60a621, 0x5e0cdd41), + TOBN(0x67c16865, 0xa6881440), TOBN(0x1093ef1a, 0x46ab52aa)}}, + {{TOBN(0xc095afb5, 0x3f4ece64), TOBN(0x6a6bb02e, 0x7604551a), + TOBN(0x55d44b4e, 0x0b26b8cd), TOBN(0xe5f9a999, 0xf971268a)}, + {TOBN(0xc08ec425, 0x11a7de84), TOBN(0x83568095, 0xfda469dd), + TOBN(0x737bfba1, 0x6c6c90a2), TOBN(0x1cb9c4a0, 0xbe229831)}}, + {{TOBN(0x93bccbba, 0xbb2eec64), TOBN(0xa0c23b64, 0xda03adbe), + TOBN(0x5f7aa00a, 0xe0e86ac4), TOBN(0x470b941e, 0xfc1401e6)}, + {TOBN(0x5ad8d679, 0x9df43574), TOBN(0x4ccfb8a9, 0x0f65d810), + TOBN(0x1bce80e3, 0xaa7fbd81), TOBN(0x273291ad, 0x9508d20a)}}, + {{TOBN(0xf5c4b46b, 0x42a92806), TOBN(0x810684ec, 0xa86ab44a), + TOBN(0x4591640b, 0xca0bc9f8), TOBN(0xb5efcdfc, 0x5c4b6054)}, + {TOBN(0x16fc8907, 0x6e9edd12), TOBN(0xe29d0b50, 0xd4d792f9), + TOBN(0xa45fd01c, 0x9b03116d), TOBN(0x85035235, 0xc81765a4)}}, + {{TOBN(0x1fe2a9b2, 0xb4b4b67c), TOBN(0xc1d10df0, 0xe8020604), + TOBN(0x9d64abfc, 0xbc8058d8), TOBN(0x8943b9b2, 0x712a0fbb)}, + {TOBN(0x90eed914, 0x3b3def04), TOBN(0x85ab3aa2, 0x4ce775ff), + TOBN(0x605fd4ca, 0x7bbc9040), TOBN(0x8b34a564, 0xe2c75dfb)}}, + {{TOBN(0x41ffc94a, 0x10358560), TOBN(0x2d8a5072, 0x9e5c28aa), + TOBN(0xe915a0fc, 0x4cc7eb15), TOBN(0xe9efab05, 0x8f6d0f5d)}, + {TOBN(0xdbab47a9, 0xd19e9b91), TOBN(0x8cfed745, 0x0276154c), + TOBN(0x154357ae, 0x2cfede0d), TOBN(0x520630df, 0x19f5a4ef)}}, + {{TOBN(0x25759f7c, 0xe382360f), TOBN(0xb6db05c9, 0x88bf5857), + TOBN(0x2917d61d, 0x6c58d46c), TOBN(0x14f8e491, 0xfd20cb7a)}, + {TOBN(0xb68a727a, 0x11c20340), TOBN(0x0386f86f, 0xaf7ccbb6), + TOBN(0x5c8bc6cc, 0xfee09a20), TOBN(0x7d76ff4a, 0xbb7eea35)}}, + {{TOBN(0xa7bdebe7, 0xdb15be7a), TOBN(0x67a08054, 0xd89f0302), + TOBN(0x56bf0ea9, 0xc1193364), TOBN(0xc8244467, 0x62837ebe)}, + {TOBN(0x32bd8e8b, 0x20d841b8), TOBN(0x127a0548, 0xdbb8a54f), + TOBN(0x83dd4ca6, 
0x63b20236), TOBN(0x87714718, 0x203491fa)}}, + {{TOBN(0x4dabcaaa, 0xaa8a5288), TOBN(0x91cc0c8a, 0xaf23a1c9), + TOBN(0x34c72c6a, 0x3f220e0c), TOBN(0xbcc20bdf, 0x1232144a)}, + {TOBN(0x6e2f42da, 0xa20ede1b), TOBN(0xc441f00c, 0x74a00515), + TOBN(0xbf46a5b6, 0x734b8c4b), TOBN(0x57409503, 0x7b56c9a4)}}, + {{TOBN(0x9f735261, 0xe4585d45), TOBN(0x9231faed, 0x6734e642), + TOBN(0x1158a176, 0xbe70ee6c), TOBN(0x35f1068d, 0x7c3501bf)}, + {TOBN(0x6beef900, 0xa2d26115), TOBN(0x649406f2, 0xef0afee3), + TOBN(0x3f43a60a, 0xbc2420a1), TOBN(0x509002a7, 0xd5aee4ac)}}, + {{TOBN(0xb46836a5, 0x3ff3571b), TOBN(0x24f98b78, 0x837927c1), + TOBN(0x6254256a, 0x4533c716), TOBN(0xf27abb0b, 0xd07ee196)}, + {TOBN(0xd7cf64fc, 0x5c6d5bfd), TOBN(0x6915c751, 0xf0cd7a77), + TOBN(0xd9f59012, 0x8798f534), TOBN(0x772b0da8, 0xf81d8b5f)}}, + {{TOBN(0x1244260c, 0x2e03fa69), TOBN(0x36cf0e3a, 0x3be1a374), + TOBN(0x6e7c1633, 0xef06b960), TOBN(0xa71a4c55, 0x671f90f6)}, + {TOBN(0x7a941251, 0x33c673db), TOBN(0xc0bea510, 0x73e8c131), + TOBN(0x61a8a699, 0xd4f6c734), TOBN(0x25e78c88, 0x341ed001)}}, + {{TOBN(0x5c18acf8, 0x8e2f7d90), TOBN(0xfdbf33d7, 0x77be32cd), + TOBN(0x0a085cd7, 0xd2eb5ee9), TOBN(0x2d702cfb, 0xb3201115)}, + {TOBN(0xb6e0ebdb, 0x85c88ce8), TOBN(0x23a3ce3c, 0x1e01d617), + TOBN(0x3041618e, 0x567333ac), TOBN(0x9dd0fd8f, 0x157edb6b)}}, + {{TOBN(0x27f74702, 0xb57872b8), TOBN(0x2ef26b4f, 0x657d5fe1), + TOBN(0x95426f0a, 0x57cf3d40), TOBN(0x847e2ad1, 0x65a6067a)}, + {TOBN(0xd474d9a0, 0x09996a74), TOBN(0x16a56acd, 0x2a26115c), + TOBN(0x02a615c3, 0xd16f4d43), TOBN(0xcc3fc965, 0xaadb85b7)}}, + {{TOBN(0x386bda73, 0xce07d1b0), TOBN(0xd82910c2, 0x58ad4178), + TOBN(0x124f82cf, 0xcd2617f4), TOBN(0xcc2f5e8d, 0xef691770)}, + {TOBN(0x82702550, 0xb8c30ccc), TOBN(0x7b856aea, 0x1a8e575a), + TOBN(0xbb822fef, 0xb1ab9459), TOBN(0x085928bc, 0xec24e38e)}}, + {{TOBN(0x5d0402ec, 0xba8f4b4d), TOBN(0xc07cd4ba, 0x00b4d58b), + TOBN(0x5d8dffd5, 0x29227e7a), TOBN(0x61d44d0c, 0x31bf386f)}, + {TOBN(0xe486dc2b, 0x135e6f4d), TOBN(0x680962eb, 0xe79410ef), + TOBN(0xa61bd343, 0xf10088b5), TOBN(0x6aa76076, 0xe2e28686)}}, + {{TOBN(0x80463d11, 0x8fb98871), TOBN(0xcb26f5c3, 0xbbc76aff), + TOBN(0xd4ab8edd, 0xfbe03614), TOBN(0xc8eb579b, 0xc0cf2dee)}, + {TOBN(0xcc004c15, 0xc93bae41), TOBN(0x46fbae5d, 0x3aeca3b2), + TOBN(0x671235cf, 0x0f1e9ab1), TOBN(0xadfba934, 0x9ec285c1)}}, + {{TOBN(0x88ded013, 0xf216c980), TOBN(0xc8ac4fb8, 0xf79e0bc1), + TOBN(0xa29b89c6, 0xfb97a237), TOBN(0xb697b780, 0x9922d8e7)}, + {TOBN(0x3142c639, 0xddb945b5), TOBN(0x447b06c7, 0xe094c3a9), + TOBN(0xcdcb3642, 0x72266c90), TOBN(0x633aad08, 0xa9385046)}}, + {{TOBN(0xa36c936b, 0xb57c6477), TOBN(0x871f8b64, 0xe94dbcc6), + TOBN(0x28d0fb62, 0xa591a67b), TOBN(0x9d40e081, 0xc1d926f5)}, + {TOBN(0x3111eaf6, 0xf2d84b5a), TOBN(0x228993f9, 0xa565b644), + TOBN(0x0ccbf592, 0x2c83188b), TOBN(0xf87b30ab, 0x3df3e197)}}, + {{TOBN(0xb8658b31, 0x7642bca8), TOBN(0x1a032d7f, 0x52800f17), + TOBN(0x051dcae5, 0x79bf9445), TOBN(0xeba6b8ee, 0x54a2e253)}, + {TOBN(0x5c8b9cad, 0xd4485692), TOBN(0x84bda40e, 0x8986e9be), + TOBN(0xd16d16a4, 0x2f0db448), TOBN(0x8ec80050, 0xa14d4188)}}, + {{TOBN(0xb2b26107, 0x98fa7aaa), TOBN(0x41209ee4, 0xf073aa4e), + TOBN(0xf1570359, 0xf2d6b19b), TOBN(0xcbe6868c, 0xfc577caf)}, + {TOBN(0x186c4bdc, 0x32c04dd3), TOBN(0xa6c35fae, 0xcfeee397), + TOBN(0xb4a1b312, 0xf086c0cf), TOBN(0xe0a5ccc6, 0xd9461fe2)}}, + {{TOBN(0xc32278aa, 0x1536189f), TOBN(0x1126c55f, 0xba6df571), + TOBN(0x0f71a602, 0xb194560e), TOBN(0x8b2d7405, 0x324bd6e1)}, + {TOBN(0x8481939e, 0x3738be71), TOBN(0xb5090b1a, 0x1a4d97a9), + 
TOBN(0x116c65a3, 0xf05ba915), TOBN(0x21863ad3, 0xaae448aa)}}, + {{TOBN(0xd24e2679, 0xa7aae5d3), TOBN(0x7076013d, 0x0de5c1c4), + TOBN(0x2d50f8ba, 0xbb05b629), TOBN(0x73c1abe2, 0x6e66efbb)}, + {TOBN(0xefd4b422, 0xf2488af7), TOBN(0xe4105d02, 0x663ba575), + TOBN(0x7eb60a8b, 0x53a69457), TOBN(0x62210008, 0xc945973b)}}, + {{TOBN(0xfb255478, 0x77a50ec6), TOBN(0xbf0392f7, 0x0a37a72c), + TOBN(0xa0a7a19c, 0x4be18e7a), TOBN(0x90d8ea16, 0x25b1e0af)}, + {TOBN(0x7582a293, 0xef953f57), TOBN(0x90a64d05, 0xbdc5465a), + TOBN(0xca79c497, 0xe2510717), TOBN(0x560dbb7c, 0x18cb641f)}}, + {{TOBN(0x1d8e3286, 0x4b66abfb), TOBN(0xd26f52e5, 0x59030900), + TOBN(0x1ee3f643, 0x5584941a), TOBN(0x6d3b3730, 0x569f5958)}, + {TOBN(0x9ff2a62f, 0x4789dba5), TOBN(0x91fcb815, 0x72b5c9b7), + TOBN(0xf446cb7d, 0x6c8f9a0e), TOBN(0x48f625c1, 0x39b7ecb5)}}, + {{TOBN(0xbabae801, 0x1c6219b8), TOBN(0xe7a562d9, 0x28ac2f23), + TOBN(0xe1b48732, 0x26e20588), TOBN(0x06ee1cad, 0x775af051)}, + {TOBN(0xda29ae43, 0xfaff79f7), TOBN(0xc141a412, 0x652ee9e0), + TOBN(0x1e127f6f, 0x195f4bd0), TOBN(0x29c6ab4f, 0x072f34f8)}}, + {{TOBN(0x7b7c1477, 0x30448112), TOBN(0x82b51af1, 0xe4a38656), + TOBN(0x2bf2028a, 0x2f315010), TOBN(0xc9a4a01f, 0x6ea88cd4)}, + {TOBN(0xf63e95d8, 0x257e5818), TOBN(0xdd8efa10, 0xb4519b16), + TOBN(0xed8973e0, 0x0da910bf), TOBN(0xed49d077, 0x5c0fe4a9)}}, + {{TOBN(0xac3aac5e, 0xb7caee1e), TOBN(0x1033898d, 0xa7f4da57), + TOBN(0x42145c0e, 0x5c6669b9), TOBN(0x42daa688, 0xc1aa2aa0)}, + {TOBN(0x629cc15c, 0x1a1d885a), TOBN(0x25572ec0, 0xf4b76817), + TOBN(0x8312e435, 0x9c8f8f28), TOBN(0x8107f8cd, 0x81965490)}}, + {{TOBN(0x516ff3a3, 0x6fa6110c), TOBN(0x74fb1eb1, 0xfb93561f), + TOBN(0x6c0c9047, 0x8457522b), TOBN(0xcfd32104, 0x6bb8bdc6)}, + {TOBN(0x2d6884a2, 0xcc80ad57), TOBN(0x7c27fc35, 0x86a9b637), + TOBN(0x3461baed, 0xadf4e8cd), TOBN(0x1d56251a, 0x617242f0)}}, + {{TOBN(0x0b80d209, 0xc955bef4), TOBN(0xdf02cad2, 0x06adb047), + TOBN(0xf0d7cb91, 0x5ec74fee), TOBN(0xd2503375, 0x1111ba44)}, + {TOBN(0x9671755e, 0xdf53cb36), TOBN(0x54dcb612, 0x3368551b), + TOBN(0x66d69aac, 0xc8a025a4), TOBN(0x6be946c6, 0xe77ef445)}}, + {{TOBN(0x719946d1, 0xa995e094), TOBN(0x65e848f6, 0xe51e04d8), + TOBN(0xe62f3300, 0x6a1e3113), TOBN(0x1541c7c1, 0x501de503)}, + {TOBN(0x4daac9fa, 0xf4acfade), TOBN(0x0e585897, 0x44cd0b71), + TOBN(0x544fd869, 0x0a51cd77), TOBN(0x60fc20ed, 0x0031016d)}}, + {{TOBN(0x58b404ec, 0xa4276867), TOBN(0x46f6c3cc, 0x34f34993), + TOBN(0x477ca007, 0xc636e5bd), TOBN(0x8018f5e5, 0x7c458b47)}, + {TOBN(0xa1202270, 0xe47b668f), TOBN(0xcef48ccd, 0xee14f203), + TOBN(0x23f98bae, 0x62ff9b4d), TOBN(0x55acc035, 0xc589eddd)}}, + {{TOBN(0x3fe712af, 0x64db4444), TOBN(0x19e9d634, 0xbecdd480), + TOBN(0xe08bc047, 0xa930978a), TOBN(0x2dbf24ec, 0xa1280733)}, + {TOBN(0x3c0ae38c, 0x2cd706b2), TOBN(0x5b012a5b, 0x359017b9), + TOBN(0x3943c38c, 0x72e0f5ae), TOBN(0x786167ea, 0x57176fa3)}}, + {{TOBN(0xe5f9897d, 0x594881dc), TOBN(0x6b5efad8, 0xcfb820c1), + TOBN(0xb2179093, 0xd55018de), TOBN(0x39ad7d32, 0x0bac56ce)}, + {TOBN(0xb55122e0, 0x2cfc0e81), TOBN(0x117c4661, 0xf6d89daa), + TOBN(0x362d01e1, 0xcb64fa09), TOBN(0x6a309b4e, 0x3e9c4ddd)}}, + {{TOBN(0xfa979fb7, 0xabea49b1), TOBN(0xb4b1d27d, 0x10e2c6c5), + TOBN(0xbd61c2c4, 0x23afde7a), TOBN(0xeb6614f8, 0x9786d358)}, + {TOBN(0x4a5d816b, 0x7f6f7459), TOBN(0xe431a44f, 0x09360e7b), + TOBN(0x8c27a032, 0xc309914c), TOBN(0xcea5d68a, 0xcaede3d8)}}, + {{TOBN(0x3668f665, 0x3a0a3f95), TOBN(0x89369416, 0x7ceba27b), + TOBN(0x89981fad, 0xe4728fe9), TOBN(0x7102c8a0, 0x8a093562)}, + {TOBN(0xbb80310e, 0x235d21c8), TOBN(0x505e55d1, 0xbefb7f7b), 
+ TOBN(0xa0a90811, 0x12958a67), TOBN(0xd67e106a, 0x4d851fef)}}, + {{TOBN(0xb84011a9, 0x431dd80e), TOBN(0xeb7c7cca, 0x73306cd9), + TOBN(0x20fadd29, 0xd1b3b730), TOBN(0x83858b5b, 0xfe37b3d3)}, + {TOBN(0xbf4cd193, 0xb6251d5c), TOBN(0x1cca1fd3, 0x1352d952), + TOBN(0xc66157a4, 0x90fbc051), TOBN(0x7990a638, 0x89b98636)}}}, + {{{TOBN(0xe5aa692a, 0x87dec0e1), TOBN(0x010ded8d, 0xf7b39d00), + TOBN(0x7b1b80c8, 0x54cfa0b5), TOBN(0x66beb876, 0xa0f8ea28)}, + {TOBN(0x50d7f531, 0x3476cd0e), TOBN(0xa63d0e65, 0xb08d3949), + TOBN(0x1a09eea9, 0x53479fc6), TOBN(0x82ae9891, 0xf499e742)}}, + {{TOBN(0xab58b910, 0x5ca7d866), TOBN(0x582967e2, 0x3adb3b34), + TOBN(0x89ae4447, 0xcceac0bc), TOBN(0x919c667c, 0x7bf56af5)}, + {TOBN(0x9aec17b1, 0x60f5dcd7), TOBN(0xec697b9f, 0xddcaadbc), + TOBN(0x0b98f341, 0x463467f5), TOBN(0xb187f1f7, 0xa967132f)}}, + {{TOBN(0x90fe7a1d, 0x214aeb18), TOBN(0x1506af3c, 0x741432f7), + TOBN(0xbb5565f9, 0xe591a0c4), TOBN(0x10d41a77, 0xb44f1bc3)}, + {TOBN(0xa09d65e4, 0xa84bde96), TOBN(0x42f060d8, 0xf20a6a1c), + TOBN(0x652a3bfd, 0xf27f9ce7), TOBN(0xb6bdb65c, 0x3b3d739f)}}, + {{TOBN(0xeb5ddcb6, 0xec7fae9f), TOBN(0x995f2714, 0xefb66e5a), + TOBN(0xdee95d8e, 0x69445d52), TOBN(0x1b6c2d46, 0x09e27620)}, + {TOBN(0x32621c31, 0x8129d716), TOBN(0xb03909f1, 0x0958c1aa), + TOBN(0x8c468ef9, 0x1af4af63), TOBN(0x162c429f, 0xfba5cdf6)}}, + {{TOBN(0x2f682343, 0x753b9371), TOBN(0x29cab45a, 0x5f1f9cd7), + TOBN(0x571623ab, 0xb245db96), TOBN(0xc507db09, 0x3fd79999)}, + {TOBN(0x4e2ef652, 0xaf036c32), TOBN(0x86f0cc78, 0x05018e5c), + TOBN(0xc10a73d4, 0xab8be350), TOBN(0x6519b397, 0x7e826327)}}, + {{TOBN(0xe8cb5eef, 0x9c053df7), TOBN(0x8de25b37, 0xb300ea6f), + TOBN(0xdb03fa92, 0xc849cffb), TOBN(0x242e43a7, 0xe84169bb)}, + {TOBN(0xe4fa51f4, 0xdd6f958e), TOBN(0x6925a77f, 0xf4445a8d), + TOBN(0xe6e72a50, 0xe90d8949), TOBN(0xc66648e3, 0x2b1f6390)}}, + {{TOBN(0xb2ab1957, 0x173e460c), TOBN(0x1bbbce75, 0x30704590), + TOBN(0xc0a90dbd, 0xdb1c7162), TOBN(0x505e399e, 0x15cdd65d)}, + {TOBN(0x68434dcb, 0x57797ab7), TOBN(0x60ad35ba, 0x6a2ca8e8), + TOBN(0x4bfdb1e0, 0xde3336c1), TOBN(0xbbef99eb, 0xd8b39015)}}, + {{TOBN(0x6c3b96f3, 0x1711ebec), TOBN(0x2da40f1f, 0xce98fdc4), + TOBN(0xb99774d3, 0x57b4411f), TOBN(0x87c8bdf4, 0x15b65bb6)}, + {TOBN(0xda3a89e3, 0xc2eef12d), TOBN(0xde95bb9b, 0x3c7471f3), + TOBN(0x600f225b, 0xd812c594), TOBN(0x54907c5d, 0x2b75a56b)}}, + {{TOBN(0xa93cc5f0, 0x8db60e35), TOBN(0x743e3cd6, 0xfa833319), + TOBN(0x7dad5c41, 0xf81683c9), TOBN(0x70c1e7d9, 0x9c34107e)}, + {TOBN(0x0edc4a39, 0xa6be0907), TOBN(0x36d47035, 0x86d0b7d3), + TOBN(0x8c76da03, 0x272bfa60), TOBN(0x0b4a07ea, 0x0f08a414)}}, + {{TOBN(0x699e4d29, 0x45c1dd53), TOBN(0xcadc5898, 0x231debb5), + TOBN(0xdf49fcc7, 0xa77f00e0), TOBN(0x93057bbf, 0xa73e5a0e)}, + {TOBN(0x2f8b7ecd, 0x027a4cd1), TOBN(0x114734b3, 0xc614011a), + TOBN(0xe7a01db7, 0x67677c68), TOBN(0x89d9be5e, 0x7e273f4f)}}, + {{TOBN(0xd225cb2e, 0x089808ef), TOBN(0xf1f7a27d, 0xd59e4107), + TOBN(0x53afc761, 0x8211b9c9), TOBN(0x0361bc67, 0xe6819159)}, + {TOBN(0x2a865d0b, 0x7f071426), TOBN(0x6a3c1810, 0xe7072567), + TOBN(0x3e3bca1e, 0x0d6bcabd), TOBN(0xa1b02bc1, 0x408591bc)}}, + {{TOBN(0xe0deee59, 0x31fba239), TOBN(0xf47424d3, 0x98bd91d1), + TOBN(0x0f8886f4, 0x071a3c1d), TOBN(0x3f7d41e8, 0xa819233b)}, + {TOBN(0x708623c2, 0xcf6eb998), TOBN(0x86bb49af, 0x609a287f), + TOBN(0x942bb249, 0x63c90762), TOBN(0x0ef6eea5, 0x55a9654b)}}, + {{TOBN(0x5f6d2d72, 0x36f5defe), TOBN(0xfa9922dc, 0x56f99176), + TOBN(0x6c8c5ece, 0xf78ce0c7), TOBN(0x7b44589d, 0xbe09b55e)}, + {TOBN(0xe11b3bca, 0x9ea83770), TOBN(0xd7fa2c7f, 
0x2ab71547), + TOBN(0x2a3dd6fa, 0x2a1ddcc0), TOBN(0x09acb430, 0x5a7b7707)}}, + {{TOBN(0x4add4a2e, 0x649d4e57), TOBN(0xcd53a2b0, 0x1917526e), + TOBN(0xc5262330, 0x20b44ac4), TOBN(0x4028746a, 0xbaa2c31d)}, + {TOBN(0x51318390, 0x64291d4c), TOBN(0xbf48f151, 0xee5ad909), + TOBN(0xcce57f59, 0x7b185681), TOBN(0x7c3ac1b0, 0x4854d442)}}, + {{TOBN(0x65587dc3, 0xc093c171), TOBN(0xae7acb24, 0x24f42b65), + TOBN(0x5a338adb, 0x955996cb), TOBN(0xc8e65675, 0x6051f91b)}, + {TOBN(0x66711fba, 0x28b8d0b1), TOBN(0x15d74137, 0xb6c10a90), + TOBN(0x70cdd7eb, 0x3a232a80), TOBN(0xc9e2f07f, 0x6191ed24)}}, + {{TOBN(0xa80d1db6, 0xf79588c0), TOBN(0xfa52fc69, 0xb55768cc), + TOBN(0x0b4df1ae, 0x7f54438a), TOBN(0x0cadd1a7, 0xf9b46a4f)}, + {TOBN(0xb40ea6b3, 0x1803dd6f), TOBN(0x488e4fa5, 0x55eaae35), + TOBN(0x9f047d55, 0x382e4e16), TOBN(0xc9b5b7e0, 0x2f6e0c98)}}, + {{TOBN(0x6b1bd2d3, 0x95762649), TOBN(0xa9604ee7, 0xc7aea3f6), + TOBN(0x3646ff27, 0x6dc6f896), TOBN(0x9bf0e7f5, 0x2860bad1)}, + {TOBN(0x2d92c821, 0x7cb44b92), TOBN(0xa2f5ce63, 0xaea9c182), + TOBN(0xd0a2afb1, 0x9154a5fd), TOBN(0x482e474c, 0x95801da6)}}, + {{TOBN(0xc19972d0, 0xb611c24b), TOBN(0x1d468e65, 0x60a8f351), + TOBN(0xeb758069, 0x7bcf6421), TOBN(0xec9dd0ee, 0x88fbc491)}, + {TOBN(0x5b59d2bf, 0x956c2e32), TOBN(0x73dc6864, 0xdcddf94e), + TOBN(0xfd5e2321, 0xbcee7665), TOBN(0xa7b4f8ef, 0x5e9a06c4)}}, + {{TOBN(0xfba918dd, 0x7280f855), TOBN(0xbbaac260, 0x8baec688), + TOBN(0xa3b3f00f, 0x33400f42), TOBN(0x3d2dba29, 0x66f2e6e4)}, + {TOBN(0xb6f71a94, 0x98509375), TOBN(0x8f33031f, 0xcea423cc), + TOBN(0x009b8dd0, 0x4807e6fb), TOBN(0x5163cfe5, 0x5cdb954c)}}, + {{TOBN(0x03cc8f17, 0xcf41c6e8), TOBN(0xf1f03c2a, 0x037b925c), + TOBN(0xc39c19cc, 0x66d2427c), TOBN(0x823d24ba, 0x7b6c18e4)}, + {TOBN(0x32ef9013, 0x901f0b4f), TOBN(0x684360f1, 0xf8941c2e), + TOBN(0x0ebaff52, 0x2c28092e), TOBN(0x7891e4e3, 0x256c932f)}}, + {{TOBN(0x51264319, 0xac445e3d), TOBN(0x553432e7, 0x8ea74381), + TOBN(0xe6eeaa69, 0x67e9c50a), TOBN(0x27ced284, 0x62e628c7)}, + {TOBN(0x3f96d375, 0x7a4afa57), TOBN(0xde0a14c3, 0xe484c150), + TOBN(0x364a24eb, 0x38bd9923), TOBN(0x1df18da0, 0xe5177422)}}, + {{TOBN(0x174e8f82, 0xd8d38a9b), TOBN(0x2e97c600, 0xe7de1391), + TOBN(0xc5709850, 0xa1c175dd), TOBN(0x969041a0, 0x32ae5035)}, + {TOBN(0xcbfd533b, 0x76a2086b), TOBN(0xd6bba71b, 0xd7c2e8fe), + TOBN(0xb2d58ee6, 0x099dfb67), TOBN(0x3a8b342d, 0x064a85d9)}}, + {{TOBN(0x3bc07649, 0x522f9be3), TOBN(0x690c075b, 0xdf1f49a8), + TOBN(0x80e1aee8, 0x3854ec42), TOBN(0x2a7dbf44, 0x17689dc7)}, + {TOBN(0xc004fc0e, 0x3faf4078), TOBN(0xb2f02e9e, 0xdf11862c), + TOBN(0xf10a5e0f, 0xa0a1b7b3), TOBN(0x30aca623, 0x8936ec80)}}, + {{TOBN(0xf83cbf05, 0x02f40d9a), TOBN(0x4681c468, 0x2c318a4d), + TOBN(0x98575618, 0x0e9c2674), TOBN(0xbe79d046, 0x1847092e)}, + {TOBN(0xaf1e480a, 0x78bd01e0), TOBN(0x6dd359e4, 0x72a51db9), + TOBN(0x62ce3821, 0xe3afbab6), TOBN(0xc5cee5b6, 0x17733199)}}, + {{TOBN(0xe08b30d4, 0x6ffd9fbb), TOBN(0x6e5bc699, 0x36c610b7), + TOBN(0xf343cff2, 0x9ce262cf), TOBN(0xca2e4e35, 0x68b914c1)}, + {TOBN(0x011d64c0, 0x16de36c5), TOBN(0xe0b10fdd, 0x42e2b829), + TOBN(0x78942981, 0x6685aaf8), TOBN(0xe7511708, 0x230ede97)}}, + {{TOBN(0x671ed8fc, 0x3b922bf8), TOBN(0xe4d8c0a0, 0x4c29b133), + TOBN(0x87eb1239, 0x3b6e99c4), TOBN(0xaff3974c, 0x8793beba)}, + {TOBN(0x03749405, 0x2c18df9b), TOBN(0xc5c3a293, 0x91007139), + TOBN(0x6a77234f, 0xe37a0b95), TOBN(0x02c29a21, 0xb661c96b)}}, + {{TOBN(0xc3aaf1d6, 0x141ecf61), TOBN(0x9195509e, 0x3bb22f53), + TOBN(0x29597404, 0x22d51357), TOBN(0x1b083822, 0x537bed60)}, + {TOBN(0xcd7d6e35, 0xe07289f0), 
TOBN(0x1f94c48c, 0x6dd86eff), + TOBN(0xc8bb1f82, 0xeb0f9cfa), TOBN(0x9ee0b7e6, 0x1b2eb97d)}}, + {{TOBN(0x5a52fe2e, 0x34d74e31), TOBN(0xa352c310, 0x3bf79ab6), + TOBN(0x97ff6c5a, 0xabfeeb8f), TOBN(0xbfbe8fef, 0xf5c97305)}, + {TOBN(0xd6081ce6, 0xa7904608), TOBN(0x1f812f3a, 0xc4fca249), + TOBN(0x9b24bc9a, 0xb9e5e200), TOBN(0x91022c67, 0x38012ee8)}}, + {{TOBN(0xe83d9c5d, 0x30a713a1), TOBN(0x4876e3f0, 0x84ef0f93), + TOBN(0xc9777029, 0xc1fbf928), TOBN(0xef7a6bb3, 0xbce7d2a4)}, + {TOBN(0xb8067228, 0xdfa2a659), TOBN(0xd5cd3398, 0xd877a48f), + TOBN(0xbea4fd8f, 0x025d0f3f), TOBN(0xd67d2e35, 0x2eae7c2b)}}, + {{TOBN(0x184de7d7, 0xcc5f4394), TOBN(0xb5551b5c, 0x4536e142), + TOBN(0x2e89b212, 0xd34aa60a), TOBN(0x14a96fea, 0xf50051d5)}, + {TOBN(0x4e21ef74, 0x0d12bb0b), TOBN(0xc522f020, 0x60b9677e), + TOBN(0x8b12e467, 0x2df7731d), TOBN(0x39f80382, 0x7b326d31)}}, + {{TOBN(0xdfb8630c, 0x39024a94), TOBN(0xaacb96a8, 0x97319452), + TOBN(0xd68a3961, 0xeda3867c), TOBN(0x0c58e2b0, 0x77c4ffca)}, + {TOBN(0x3d545d63, 0x4da919fa), TOBN(0xef79b69a, 0xf15e2289), + TOBN(0x54bc3d3d, 0x808bab10), TOBN(0xc8ab3007, 0x45f82c37)}}, + {{TOBN(0xc12738b6, 0x7c4a658a), TOBN(0xb3c47639, 0x40e72182), + TOBN(0x3b77be46, 0x8798e44f), TOBN(0xdc047df2, 0x17a7f85f)}, + {TOBN(0x2439d4c5, 0x5e59d92d), TOBN(0xcedca475, 0xe8e64d8d), + TOBN(0xa724cd0d, 0x87ca9b16), TOBN(0x35e4fd59, 0xa5540dfe)}}, + {{TOBN(0xf8c1ff18, 0xe4bcf6b1), TOBN(0x856d6285, 0x295018fa), + TOBN(0x433f665c, 0x3263c949), TOBN(0xa6a76dd6, 0xa1f21409)}, + {TOBN(0x17d32334, 0xcc7b4f79), TOBN(0xa1d03122, 0x06720e4a), + TOBN(0xadb6661d, 0x81d9bed5), TOBN(0xf0d6fb02, 0x11db15d1)}}, + {{TOBN(0x7fd11ad5, 0x1fb747d2), TOBN(0xab50f959, 0x3033762b), + TOBN(0x2a7e711b, 0xfbefaf5a), TOBN(0xc7393278, 0x3fef2bbf)}, + {TOBN(0xe29fa244, 0x0df6f9be), TOBN(0x9092757b, 0x71efd215), + TOBN(0xee60e311, 0x4f3d6fd9), TOBN(0x338542d4, 0x0acfb78b)}}, + {{TOBN(0x44a23f08, 0x38961a0f), TOBN(0x1426eade, 0x986987ca), + TOBN(0x36e6ee2e, 0x4a863cc6), TOBN(0x48059420, 0x628b8b79)}, + {TOBN(0x30303ad8, 0x7396e1de), TOBN(0x5c8bdc48, 0x38c5aad1), + TOBN(0x3e40e11f, 0x5c8f5066), TOBN(0xabd6e768, 0x8d246bbd)}}, + {{TOBN(0x68aa40bb, 0x23330a01), TOBN(0xd23f5ee4, 0xc34eafa0), + TOBN(0x3bbee315, 0x5de02c21), TOBN(0x18dd4397, 0xd1d8dd06)}, + {TOBN(0x3ba1939a, 0x122d7b44), TOBN(0xe6d3b40a, 0xa33870d6), + TOBN(0x8e620f70, 0x1c4fe3f8), TOBN(0xf6bba1a5, 0xd3a50cbf)}}, + {{TOBN(0x4a78bde5, 0xcfc0aee0), TOBN(0x847edc46, 0xc08c50bd), + TOBN(0xbaa2439c, 0xad63c9b2), TOBN(0xceb4a728, 0x10fc2acb)}, + {TOBN(0xa419e40e, 0x26da033d), TOBN(0x6cc3889d, 0x03e02683), + TOBN(0x1cd28559, 0xfdccf725), TOBN(0x0fd7e0f1, 0x8d13d208)}}, + {{TOBN(0x01b9733b, 0x1f0df9d4), TOBN(0x8cc2c5f3, 0xa2b5e4f3), + TOBN(0x43053bfa, 0x3a304fd4), TOBN(0x8e87665c, 0x0a9f1aa7)}, + {TOBN(0x087f29ec, 0xd73dc965), TOBN(0x15ace455, 0x3e9023db), + TOBN(0x2370e309, 0x2bce28b4), TOBN(0xf9723442, 0xb6b1e84a)}}, + {{TOBN(0xbeee662e, 0xb72d9f26), TOBN(0xb19396de, 0xf0e47109), + TOBN(0x85b1fa73, 0xe13289d0), TOBN(0x436cf77e, 0x54e58e32)}, + {TOBN(0x0ec833b3, 0xe990ef77), TOBN(0x7373e3ed, 0x1b11fc25), + TOBN(0xbe0eda87, 0x0fc332ce), TOBN(0xced04970, 0x8d7ea856)}}, + {{TOBN(0xf85ff785, 0x7e977ca0), TOBN(0xb66ee8da, 0xdfdd5d2b), + TOBN(0xf5e37950, 0x905af461), TOBN(0x587b9090, 0x966d487c)}, + {TOBN(0x6a198a1b, 0x32ba0127), TOBN(0xa7720e07, 0x141615ac), + TOBN(0xa23f3499, 0x996ef2f2), TOBN(0xef5f64b4, 0x470bcb3d)}}, + {{TOBN(0xa526a962, 0x92b8c559), TOBN(0x0c14aac0, 0x69740a0f), + TOBN(0x0d41a9e3, 0xa6bdc0a5), TOBN(0x97d52106, 0x9c48aef4)}, + {TOBN(0xcf16bd30, 
0x3e7c253b), TOBN(0xcc834b1a, 0x47fdedc1), + TOBN(0x7362c6e5, 0x373aab2e), TOBN(0x264ed85e, 0xc5f590ff)}}, + {{TOBN(0x7a46d9c0, 0x66d41870), TOBN(0xa50c20b1, 0x4787ba09), + TOBN(0x185e7e51, 0xe3d44635), TOBN(0xb3b3e080, 0x31e2d8dc)}, + {TOBN(0xbed1e558, 0xa179e9d9), TOBN(0x2daa3f79, 0x74a76781), + TOBN(0x4372baf2, 0x3a40864f), TOBN(0x46900c54, 0x4fe75cb5)}}, + {{TOBN(0xb95f171e, 0xf76765d0), TOBN(0x4ad726d2, 0x95c87502), + TOBN(0x2ec769da, 0x4d7c99bd), TOBN(0x5e2ddd19, 0xc36cdfa8)}, + {TOBN(0xc22117fc, 0xa93e6dea), TOBN(0xe8a2583b, 0x93771123), + TOBN(0xbe2f6089, 0xfa08a3a2), TOBN(0x4809d5ed, 0x8f0e1112)}}, + {{TOBN(0x3b414aa3, 0xda7a095e), TOBN(0x9049acf1, 0x26f5aadd), + TOBN(0x78d46a4d, 0x6be8b84a), TOBN(0xd66b1963, 0xb732b9b3)}, + {TOBN(0x5c2ac2a0, 0xde6e9555), TOBN(0xcf52d098, 0xb5bd8770), + TOBN(0x15a15fa6, 0x0fd28921), TOBN(0x56ccb81e, 0x8b27536d)}}, + {{TOBN(0x0f0d8ab8, 0x9f4ccbb8), TOBN(0xed5f44d2, 0xdb221729), + TOBN(0x43141988, 0x00bed10c), TOBN(0xc94348a4, 0x1d735b8b)}, + {TOBN(0x79f3e9c4, 0x29ef8479), TOBN(0x4c13a4e3, 0x614c693f), + TOBN(0x32c9af56, 0x8e143a14), TOBN(0xbc517799, 0xe29ac5c4)}}, + {{TOBN(0x05e17992, 0x2774856f), TOBN(0x6e52fb05, 0x6c1bf55f), + TOBN(0xaeda4225, 0xe4f19e16), TOBN(0x70f4728a, 0xaf5ccb26)}, + {TOBN(0x5d2118d1, 0xb2947f22), TOBN(0xc827ea16, 0x281d6fb9), + TOBN(0x8412328d, 0x8cf0eabd), TOBN(0x45ee9fb2, 0x03ef9dcf)}}, + {{TOBN(0x8e700421, 0xbb937d63), TOBN(0xdf8ff2d5, 0xcc4b37a6), + TOBN(0xa4c0d5b2, 0x5ced7b68), TOBN(0x6537c1ef, 0xc7308f59)}, + {TOBN(0x25ce6a26, 0x3b37f8e8), TOBN(0x170e9a9b, 0xdeebc6ce), + TOBN(0xdd037952, 0x8728d72c), TOBN(0x445b0e55, 0x850154bc)}}, + {{TOBN(0x4b7d0e06, 0x83a7337b), TOBN(0x1e3416d4, 0xffecf249), + TOBN(0x24840eff, 0x66a2b71f), TOBN(0xd0d9a50a, 0xb37cc26d)}, + {TOBN(0xe2198150, 0x6fe28ef7), TOBN(0x3cc5ef16, 0x23324c7f), + TOBN(0x220f3455, 0x769b5263), TOBN(0xe2ade2f1, 0xa10bf475)}}, + {{TOBN(0x28cd20fa, 0x458d3671), TOBN(0x1549722c, 0x2dc4847b), + TOBN(0x6dd01e55, 0x591941e3), TOBN(0x0e6fbcea, 0x27128ccb)}, + {TOBN(0xae1a1e6b, 0x3bef0262), TOBN(0xfa8c472c, 0x8f54e103), + TOBN(0x7539c0a8, 0x72c052ec), TOBN(0xd7b27369, 0x5a3490e9)}}, + {{TOBN(0x143fe1f1, 0x71684349), TOBN(0x36b4722e, 0x32e19b97), + TOBN(0xdc059227, 0x90980aff), TOBN(0x175c9c88, 0x9e13d674)}, + {TOBN(0xa7de5b22, 0x6e6bfdb1), TOBN(0x5ea5b7b2, 0xbedb4b46), + TOBN(0xd5570191, 0xd34a6e44), TOBN(0xfcf60d2e, 0xa24ff7e6)}}, + {{TOBN(0x614a392d, 0x677819e1), TOBN(0x7be74c7e, 0xaa5a29e8), + TOBN(0xab50fece, 0x63c85f3f), TOBN(0xaca2e2a9, 0x46cab337)}, + {TOBN(0x7f700388, 0x122a6fe3), TOBN(0xdb69f703, 0x882a04a8), + TOBN(0x9a77935d, 0xcf7aed57), TOBN(0xdf16207c, 0x8d91c86f)}}, + {{TOBN(0x2fca49ab, 0x63ed9998), TOBN(0xa3125c44, 0xa77ddf96), + TOBN(0x05dd8a86, 0x24344072), TOBN(0xa023dda2, 0xfec3fb56)}, + {TOBN(0x421b41fc, 0x0c743032), TOBN(0x4f2120c1, 0x5e438639), + TOBN(0xfb7cae51, 0xc83c1b07), TOBN(0xb2370caa, 0xcac2171a)}}, + {{TOBN(0x2eb2d962, 0x6cc820fb), TOBN(0x59feee5c, 0xb85a44bf), + TOBN(0x94620fca, 0x5b6598f0), TOBN(0x6b922cae, 0x7e314051)}, + {TOBN(0xff8745ad, 0x106bed4e), TOBN(0x546e71f5, 0xdfa1e9ab), + TOBN(0x935c1e48, 0x1ec29487), TOBN(0x9509216c, 0x4d936530)}}, + {{TOBN(0xc7ca3067, 0x85c9a2db), TOBN(0xd6ae5152, 0x6be8606f), + TOBN(0x09dbcae6, 0xe14c651d), TOBN(0xc9536e23, 0x9bc32f96)}, + {TOBN(0xa90535a9, 0x34521b03), TOBN(0xf39c526c, 0x878756ff), + TOBN(0x383172ec, 0x8aedf03c), TOBN(0x20a8075e, 0xefe0c034)}}, + {{TOBN(0xf22f9c62, 0x64026422), TOBN(0x8dd10780, 0x24b9d076), + TOBN(0x944c742a, 0x3bef2950), TOBN(0x55b9502e, 0x88a2b00b)}, + 
{TOBN(0xa59e14b4, 0x86a09817), TOBN(0xa39dd3ac, 0x47bb4071), + TOBN(0x55137f66, 0x3be0592f), TOBN(0x07fcafd4, 0xc9e63f5b)}}, + {{TOBN(0x963652ee, 0x346eb226), TOBN(0x7dfab085, 0xec2facb7), + TOBN(0x273bf2b8, 0x691add26), TOBN(0x30d74540, 0xf2b46c44)}, + {TOBN(0x05e8e73e, 0xf2c2d065), TOBN(0xff9b8a00, 0xd42eeac9), + TOBN(0x2fcbd205, 0x97209d22), TOBN(0xeb740ffa, 0xde14ea2c)}}, + {{TOBN(0xc71ff913, 0xa8aef518), TOBN(0x7bfc74bb, 0xfff4cfa2), + TOBN(0x1716680c, 0xb6b36048), TOBN(0x121b2cce, 0x9ef79af1)}, + {TOBN(0xbff3c836, 0xa01eb3d3), TOBN(0x50eb1c6a, 0x5f79077b), + TOBN(0xa48c32d6, 0xa004bbcf), TOBN(0x47a59316, 0x7d64f61d)}}, + {{TOBN(0x6068147f, 0x93102016), TOBN(0x12c5f654, 0x94d12576), + TOBN(0xefb071a7, 0xc9bc6b91), TOBN(0x7c2da0c5, 0x6e23ea95)}, + {TOBN(0xf4fd45b6, 0xd4a1dd5d), TOBN(0x3e7ad9b6, 0x9122b13c), + TOBN(0x342ca118, 0xe6f57a48), TOBN(0x1c2e94a7, 0x06f8288f)}}, + {{TOBN(0x99e68f07, 0x5a97d231), TOBN(0x7c80de97, 0x4d838758), + TOBN(0xbce0f5d0, 0x05872727), TOBN(0xbe5d95c2, 0x19c4d016)}, + {TOBN(0x921d5cb1, 0x9c2492ee), TOBN(0x42192dc1, 0x404d6fb3), + TOBN(0x4c84dcd1, 0x32f988d3), TOBN(0xde26d61f, 0xa17b8e85)}}, + {{TOBN(0xc466dcb6, 0x137c7408), TOBN(0x9a38d7b6, 0x36a266da), + TOBN(0x7ef5cb06, 0x83bebf1b), TOBN(0xe5cdcbbf, 0x0fd014e3)}, + {TOBN(0x30aa376d, 0xf65965a0), TOBN(0x60fe88c2, 0xebb3e95e), + TOBN(0x33fd0b61, 0x66ee6f20), TOBN(0x8827dcdb, 0x3f41f0a0)}}, + {{TOBN(0xbf8a9d24, 0x0c56c690), TOBN(0x40265dad, 0xddb7641d), + TOBN(0x522b05bf, 0x3a6b662b), TOBN(0x466d1dfe, 0xb1478c9b)}, + {TOBN(0xaa616962, 0x1484469b), TOBN(0x0db60549, 0x02df8f9f), + TOBN(0xc37bca02, 0x3cb8bf51), TOBN(0x5effe346, 0x21371ce8)}}, + {{TOBN(0xe8f65264, 0xff112c32), TOBN(0x8a9c736d, 0x7b971fb2), + TOBN(0xa4f19470, 0x7b75080d), TOBN(0xfc3f2c5a, 0x8839c59b)}, + {TOBN(0x1d6c777e, 0x5aeb49c2), TOBN(0xf3db034d, 0xda1addfe), + TOBN(0xd76fee5a, 0x5535affc), TOBN(0x0853ac70, 0xb92251fd)}}, + {{TOBN(0x37e3d594, 0x8b2a29d5), TOBN(0x28f1f457, 0x4de00ddb), + TOBN(0x8083c1b5, 0xf42c328b), TOBN(0xd8ef1d8f, 0xe493c73b)}, + {TOBN(0x96fb6260, 0x41dc61bd), TOBN(0xf74e8a9d, 0x27ee2f8a), + TOBN(0x7c605a80, 0x2c946a5d), TOBN(0xeed48d65, 0x3839ccfd)}}, + {{TOBN(0x9894344f, 0x3a29467a), TOBN(0xde81e949, 0xc51eba6d), + TOBN(0xdaea066b, 0xa5e5c2f2), TOBN(0x3fc8a614, 0x08c8c7b3)}, + {TOBN(0x7adff88f, 0x06d0de9f), TOBN(0xbbc11cf5, 0x3b75ce0a), + TOBN(0x9fbb7acc, 0xfbbc87d5), TOBN(0xa1458e26, 0x7badfde2)}}}, + {{{TOBN(0x1cb43668, 0xe039c256), TOBN(0x5f26fb8b, 0x7c17fd5d), + TOBN(0xeee426af, 0x79aa062b), TOBN(0x072002d0, 0xd78fbf04)}, + {TOBN(0x4c9ca237, 0xe84fb7e3), TOBN(0xb401d8a1, 0x0c82133d), + TOBN(0xaaa52592, 0x6d7e4181), TOBN(0xe9430833, 0x73dbb152)}}, + {{TOBN(0xf92dda31, 0xbe24319a), TOBN(0x03f7d28b, 0xe095a8e7), + TOBN(0xa52fe840, 0x98782185), TOBN(0x276ddafe, 0x29c24dbc)}, + {TOBN(0x80cd5496, 0x1d7a64eb), TOBN(0xe4360889, 0x7f1dbe42), + TOBN(0x2f81a877, 0x8438d2d5), TOBN(0x7e4d52a8, 0x85169036)}}, + {{TOBN(0x19e3d5b1, 0x1d59715d), TOBN(0xc7eaa762, 0xd788983e), + TOBN(0xe5a730b0, 0xabf1f248), TOBN(0xfbab8084, 0xfae3fd83)}, + {TOBN(0x65e50d21, 0x53765b2f), TOBN(0xbdd4e083, 0xfa127f3d), + TOBN(0x9cf3c074, 0x397b1b10), TOBN(0x59f8090c, 0xb1b59fd3)}}, + {{TOBN(0x7b15fd9d, 0x615faa8f), TOBN(0x8fa1eb40, 0x968554ed), + TOBN(0x7bb4447e, 0x7aa44882), TOBN(0x2bb2d0d1, 0x029fff32)}, + {TOBN(0x075e2a64, 0x6caa6d2f), TOBN(0x8eb879de, 0x22e7351b), + TOBN(0xbcd5624e, 0x9a506c62), TOBN(0x218eaef0, 0xa87e24dc)}}, + {{TOBN(0x37e56847, 0x44ddfa35), TOBN(0x9ccfc5c5, 0xdab3f747), + TOBN(0x9ac1df3f, 0x1ee96cf4), TOBN(0x0c0571a1, 
0x3b480b8f)}, + {TOBN(0x2fbeb3d5, 0x4b3a7b3c), TOBN(0x35c03669, 0x5dcdbb99), + TOBN(0x52a0f5dc, 0xb2415b3a), TOBN(0xd57759b4, 0x4413ed9a)}}, + {{TOBN(0x1fe647d8, 0x3d30a2c5), TOBN(0x0857f77e, 0xf78a81dc), + TOBN(0x11d5a334, 0x131a4a9b), TOBN(0xc0a94af9, 0x29d393f5)}, + {TOBN(0xbc3a5c0b, 0xdaa6ec1a), TOBN(0xba9fe493, 0x88d2d7ed), + TOBN(0xbb4335b4, 0xbb614797), TOBN(0x991c4d68, 0x72f83533)}}, + {{TOBN(0x53258c28, 0xd2f01cb3), TOBN(0x93d6eaa3, 0xd75db0b1), + TOBN(0x419a2b0d, 0xe87d0db4), TOBN(0xa1e48f03, 0xd8fe8493)}, + {TOBN(0xf747faf6, 0xc508b23a), TOBN(0xf137571a, 0x35d53549), + TOBN(0x9f5e58e2, 0xfcf9b838), TOBN(0xc7186cee, 0xa7fd3cf5)}}, + {{TOBN(0x77b868ce, 0xe978a1d3), TOBN(0xe3a68b33, 0x7ab92d04), + TOBN(0x51029794, 0x87a5b862), TOBN(0x5f0606c3, 0x3a61d41d)}, + {TOBN(0x2814be27, 0x6f9326f1), TOBN(0x2f521c14, 0xc6fe3c2e), + TOBN(0x17464d7d, 0xacdf7351), TOBN(0x10f5f9d3, 0x777f7e44)}}, + {{TOBN(0xce8e616b, 0x269fb37d), TOBN(0xaaf73804, 0x7de62de5), + TOBN(0xaba11175, 0x4fdd4153), TOBN(0x515759ba, 0x3770b49b)}, + {TOBN(0x8b09ebf8, 0xaa423a61), TOBN(0x592245a1, 0xcd41fb92), + TOBN(0x1cba8ec1, 0x9b4c8936), TOBN(0xa87e91e3, 0xaf36710e)}}, + {{TOBN(0x1fd84ce4, 0x3d34a2e3), TOBN(0xee3759ce, 0xb43b5d61), + TOBN(0x895bc78c, 0x619186c7), TOBN(0xf19c3809, 0xcbb9725a)}, + {TOBN(0xc0be21aa, 0xde744b1f), TOBN(0xa7d222b0, 0x60f8056b), + TOBN(0x74be6157, 0xb23efe11), TOBN(0x6fab2b4f, 0x0cd68253)}}, + {{TOBN(0xad33ea5f, 0x4bf1d725), TOBN(0x9c1d8ee2, 0x4f6c950f), + TOBN(0x544ee78a, 0xa377af06), TOBN(0x54f489bb, 0x94a113e1)}, + {TOBN(0x8f11d634, 0x992fb7e8), TOBN(0x0169a7aa, 0xa2a44347), + TOBN(0x1d49d4af, 0x95020e00), TOBN(0x95945722, 0xe08e120b)}}, + {{TOBN(0xb6e33878, 0xa4d32282), TOBN(0xe36e029d, 0x48020ae7), + TOBN(0xe05847fb, 0x37a9b750), TOBN(0xf876812c, 0xb29e3819)}, + {TOBN(0x84ad138e, 0xd23a17f0), TOBN(0x6d7b4480, 0xf0b3950e), + TOBN(0xdfa8aef4, 0x2fd67ae0), TOBN(0x8d3eea24, 0x52333af6)}}, + {{TOBN(0x0d052075, 0xb15d5acc), TOBN(0xc6d9c79f, 0xbd815bc4), + TOBN(0x8dcafd88, 0xdfa36cf2), TOBN(0x908ccbe2, 0x38aa9070)}, + {TOBN(0x638722c4, 0xba35afce), TOBN(0x5a3da8b0, 0xfd6abf0b), + TOBN(0x2dce252c, 0xc9c335c1), TOBN(0x84e7f0de, 0x65aa799b)}}, + {{TOBN(0x2101a522, 0xb99a72cb), TOBN(0x06de6e67, 0x87618016), + TOBN(0x5ff8c7cd, 0xe6f3653e), TOBN(0x0a821ab5, 0xc7a6754a)}, + {TOBN(0x7e3fa52b, 0x7cb0b5a2), TOBN(0xa7fb121c, 0xc9048790), + TOBN(0x1a725020, 0x06ce053a), TOBN(0xb490a31f, 0x04e929b0)}}, + {{TOBN(0xe17be47d, 0x62dd61ad), TOBN(0x781a961c, 0x6be01371), + TOBN(0x1063bfd3, 0xdae3cbba), TOBN(0x35647406, 0x7f73c9ba)}, + {TOBN(0xf50e957b, 0x2736a129), TOBN(0xa6313702, 0xed13f256), + TOBN(0x9436ee65, 0x3a19fcc5), TOBN(0xcf2bdb29, 0xe7a4c8b6)}}, + {{TOBN(0xb06b1244, 0xc5f95cd8), TOBN(0xda8c8af0, 0xf4ab95f4), + TOBN(0x1bae59c2, 0xb9e5836d), TOBN(0x07d51e7e, 0x3acffffc)}, + {TOBN(0x01e15e6a, 0xc2ccbcda), TOBN(0x3bc1923f, 0x8528c3e0), + TOBN(0x43324577, 0xa49fead4), TOBN(0x61a1b884, 0x2aa7a711)}}, + {{TOBN(0xf9a86e08, 0x700230ef), TOBN(0x0af585a1, 0xbd19adf8), + TOBN(0x7645f361, 0xf55ad8f2), TOBN(0x6e676223, 0x46c3614c)}, + {TOBN(0x23cb257c, 0x4e774d3f), TOBN(0x82a38513, 0xac102d1b), + TOBN(0x9bcddd88, 0x7b126aa5), TOBN(0xe716998b, 0xeefd3ee4)}}, + {{TOBN(0x4239d571, 0xfb167583), TOBN(0xdd011c78, 0xd16c8f8a), + TOBN(0x271c2895, 0x69a27519), TOBN(0x9ce0a3b7, 0xd2d64b6a)}, + {TOBN(0x8c977289, 0xd5ec6738), TOBN(0xa3b49f9a, 0x8840ef6b), + TOBN(0x808c14c9, 0x9a453419), TOBN(0x5c00295b, 0x0cf0a2d5)}}, + {{TOBN(0x524414fb, 0x1d4bcc76), TOBN(0xb07691d2, 0x459a88f1), + TOBN(0x77f43263, 0xf70d110f), 
TOBN(0x64ada5e0, 0xb7abf9f3)}, + {TOBN(0xafd0f94e, 0x5b544cf5), TOBN(0xb4a13a15, 0xfd2713fe), + TOBN(0xb99b7d6e, 0x250c74f4), TOBN(0x097f2f73, 0x20324e45)}}, + {{TOBN(0x994b37d8, 0xaffa8208), TOBN(0xc3c31b0b, 0xdc29aafc), + TOBN(0x3da74651, 0x7a3a607f), TOBN(0xd8e1b8c1, 0xfe6955d6)}, + {TOBN(0x716e1815, 0xc8418682), TOBN(0x541d487f, 0x7dc91d97), + TOBN(0x48a04669, 0xc6996982), TOBN(0xf39cab15, 0x83a6502e)}}, + {{TOBN(0x025801a0, 0xe68db055), TOBN(0xf3569758, 0xba3338d5), + TOBN(0xb0c8c0aa, 0xee2afa84), TOBN(0x4f6985d3, 0xfb6562d1)}, + {TOBN(0x351f1f15, 0x132ed17a), TOBN(0x510ed0b4, 0xc04365fe), + TOBN(0xa3f98138, 0xe5b1f066), TOBN(0xbc9d95d6, 0x32df03dc)}}, + {{TOBN(0xa83ccf6e, 0x19abd09e), TOBN(0x0b4097c1, 0x4ff17edb), + TOBN(0x58a5c478, 0xd64a06ce), TOBN(0x2ddcc3fd, 0x544a58fd)}, + {TOBN(0xd449503d, 0x9e8153b8), TOBN(0x3324fd02, 0x7774179b), + TOBN(0xaf5d47c8, 0xdbd9120c), TOBN(0xeb860162, 0x34fa94db)}}, + {{TOBN(0x5817bdd1, 0x972f07f4), TOBN(0xe5579e2e, 0xd27bbceb), + TOBN(0x86847a1f, 0x5f11e5a6), TOBN(0xb39ed255, 0x7c3cf048)}, + {TOBN(0xe1076417, 0xa2f62e55), TOBN(0x6b9ab38f, 0x1bcf82a2), + TOBN(0x4bb7c319, 0x7aeb29f9), TOBN(0xf6d17da3, 0x17227a46)}}, + {{TOBN(0xab53ddbd, 0x0f968c00), TOBN(0xa03da7ec, 0x000c880b), + TOBN(0x7b239624, 0x6a9ad24d), TOBN(0x612c0401, 0x01ec60d0)}, + {TOBN(0x70d10493, 0x109f5df1), TOBN(0xfbda4030, 0x80af7550), + TOBN(0x30b93f95, 0xc6b9a9b3), TOBN(0x0c74ec71, 0x007d9418)}}, + {{TOBN(0x94175564, 0x6edb951f), TOBN(0x5f4a9d78, 0x7f22c282), + TOBN(0xb7870895, 0xb38d1196), TOBN(0xbc593df3, 0xa228ce7c)}, + {TOBN(0xc78c5bd4, 0x6af3641a), TOBN(0x7802200b, 0x3d9b3dcc), + TOBN(0x0dc73f32, 0x8be33304), TOBN(0x847ed87d, 0x61ffb79a)}}, + {{TOBN(0xf85c974e, 0x6d671192), TOBN(0x1e14100a, 0xde16f60f), + TOBN(0x45cb0d5a, 0x95c38797), TOBN(0x18923bba, 0x9b022da4)}, + {TOBN(0xef2be899, 0xbbe7e86e), TOBN(0x4a1510ee, 0x216067bf), + TOBN(0xd98c8154, 0x84d5ce3e), TOBN(0x1af777f0, 0xf92a2b90)}}, + {{TOBN(0x9fbcb400, 0x4ef65724), TOBN(0x3e04a4c9, 0x3c0ca6fe), + TOBN(0xfb3e2cb5, 0x55002994), TOBN(0x1f3a93c5, 0x5363ecab)}, + {TOBN(0x1fe00efe, 0x3923555b), TOBN(0x744bedd9, 0x1e1751ea), + TOBN(0x3fb2db59, 0x6ab69357), TOBN(0x8dbd7365, 0xf5e6618b)}}, + {{TOBN(0x99d53099, 0xdf1ea40e), TOBN(0xb3f24a0b, 0x57d61e64), + TOBN(0xd088a198, 0x596eb812), TOBN(0x22c8361b, 0x5762940b)}, + {TOBN(0x66f01f97, 0xf9c0d95c), TOBN(0x88461172, 0x8e43cdae), + TOBN(0x11599a7f, 0xb72b15c3), TOBN(0x135a7536, 0x420d95cc)}}, + {{TOBN(0x2dcdf0f7, 0x5f7ae2f6), TOBN(0x15fc6e1d, 0xd7fa6da2), + TOBN(0x81ca829a, 0xd1d441b6), TOBN(0x84c10cf8, 0x04a106b6)}, + {TOBN(0xa9b26c95, 0xa73fbbd0), TOBN(0x7f24e0cb, 0x4d8f6ee8), + TOBN(0x48b45937, 0x1e25a043), TOBN(0xf8a74fca, 0x036f3dfe)}}, + {{TOBN(0x1ed46585, 0xc9f84296), TOBN(0x7fbaa8fb, 0x3bc278b0), + TOBN(0xa8e96cd4, 0x6c4fcbd0), TOBN(0x940a1202, 0x73b60a5f)}, + {TOBN(0x34aae120, 0x55a4aec8), TOBN(0x550e9a74, 0xdbd742f0), + TOBN(0x794456d7, 0x228c68ab), TOBN(0x492f8868, 0xa4e25ec6)}}, + {{TOBN(0x682915ad, 0xb2d8f398), TOBN(0xf13b51cc, 0x5b84c953), + TOBN(0xcda90ab8, 0x5bb917d6), TOBN(0x4b615560, 0x4ea3dee1)}, + {TOBN(0x578b4e85, 0x0a52c1c8), TOBN(0xeab1a695, 0x20b75fc4), + TOBN(0x60c14f3c, 0xaa0bb3c6), TOBN(0x220f448a, 0xb8216094)}}, + {{TOBN(0x4fe7ee31, 0xb0e63d34), TOBN(0xf4600572, 0xa9e54fab), + TOBN(0xc0493334, 0xd5e7b5a4), TOBN(0x8589fb92, 0x06d54831)}, + {TOBN(0xaa70f5cc, 0x6583553a), TOBN(0x0879094a, 0xe25649e5), + TOBN(0xcc904507, 0x10044652), TOBN(0xebb0696d, 0x02541c4f)}}, + {{TOBN(0x5a171fde, 0xb9718710), TOBN(0x38f1bed8, 0xf374a9f5), + TOBN(0xc8c582e1, 
0xba39bdc1), TOBN(0xfc457b0a, 0x908cc0ce)}, + {TOBN(0x9a187fd4, 0x883841e2), TOBN(0x8ec25b39, 0x38725381), + TOBN(0x2553ed05, 0x96f84395), TOBN(0x095c7661, 0x6f6c6897)}}, + {{TOBN(0x917ac85c, 0x4bdc5610), TOBN(0xb2885fe4, 0x179eb301), + TOBN(0x5fc65547, 0x8b78bdcc), TOBN(0x4a9fc893, 0xe59e4699)}, + {TOBN(0xbb7ff0cd, 0x3ce299af), TOBN(0x195be9b3, 0xadf38b20), + TOBN(0x6a929c87, 0xd38ddb8f), TOBN(0x55fcc99c, 0xb21a51b9)}}, + {{TOBN(0x2b695b4c, 0x721a4593), TOBN(0xed1e9a15, 0x768eaac2), + TOBN(0xfb63d71c, 0x7489f914), TOBN(0xf98ba31c, 0x78118910)}, + {TOBN(0x80291373, 0x9b128eb4), TOBN(0x7801214e, 0xd448af4a), + TOBN(0xdbd2e22b, 0x55418dd3), TOBN(0xeffb3c0d, 0xd3998242)}}, + {{TOBN(0xdfa6077c, 0xc7bf3827), TOBN(0xf2165bcb, 0x47f8238f), + TOBN(0xfe37cf68, 0x8564d554), TOBN(0xe5f825c4, 0x0a81fb98)}, + {TOBN(0x43cc4f67, 0xffed4d6f), TOBN(0xbc609578, 0xb50a34b0), + TOBN(0x8aa8fcf9, 0x5041faf1), TOBN(0x5659f053, 0x651773b6)}}, + {{TOBN(0xe87582c3, 0x6044d63b), TOBN(0xa6089409, 0x0cdb0ca0), + TOBN(0x8c993e0f, 0xbfb2bcf6), TOBN(0xfc64a719, 0x45985cfc)}, + {TOBN(0x15c4da80, 0x83dbedba), TOBN(0x804ae112, 0x2be67df7), + TOBN(0xda4c9658, 0xa23defde), TOBN(0x12002ddd, 0x5156e0d3)}}, + {{TOBN(0xe68eae89, 0x5dd21b96), TOBN(0x8b99f28b, 0xcf44624d), + TOBN(0x0ae00808, 0x1ec8897a), TOBN(0xdd0a9303, 0x6712f76e)}, + {TOBN(0x96237522, 0x4e233de4), TOBN(0x192445b1, 0x2b36a8a5), + TOBN(0xabf9ff74, 0x023993d9), TOBN(0x21f37bf4, 0x2aad4a8f)}}, + {{TOBN(0x340a4349, 0xf8bd2bbd), TOBN(0x1d902cd9, 0x4868195d), + TOBN(0x3d27bbf1, 0xe5fdb6f1), TOBN(0x7a5ab088, 0x124f9f1c)}, + {TOBN(0xc466ab06, 0xf7a09e03), TOBN(0x2f8a1977, 0x31f2c123), + TOBN(0xda355dc7, 0x041b6657), TOBN(0xcb840d12, 0x8ece2a7c)}}, + {{TOBN(0xb600ad9f, 0x7db32675), TOBN(0x78fea133, 0x07a06f1b), + TOBN(0x5d032269, 0xb31f6094), TOBN(0x07753ef5, 0x83ec37aa)}, + {TOBN(0x03485aed, 0x9c0bea78), TOBN(0x41bb3989, 0xbc3f4524), + TOBN(0x09403761, 0x697f726d), TOBN(0x6109beb3, 0xdf394820)}}, + {{TOBN(0x804111ea, 0x3b6d1145), TOBN(0xb6271ea9, 0xa8582654), + TOBN(0x619615e6, 0x24e66562), TOBN(0xa2554945, 0xd7b6ad9c)}, + {TOBN(0xd9c4985e, 0x99bfe35f), TOBN(0x9770ccc0, 0x7b51cdf6), + TOBN(0x7c327013, 0x92881832), TOBN(0x8777d45f, 0x286b26d1)}}, + {{TOBN(0x9bbeda22, 0xd847999d), TOBN(0x03aa33b6, 0xc3525d32), + TOBN(0x4b7b96d4, 0x28a959a1), TOBN(0xbb3786e5, 0x31e5d234)}, + {TOBN(0xaeb5d3ce, 0x6961f247), TOBN(0x20aa85af, 0x02f93d3f), + TOBN(0x9cd1ad3d, 0xd7a7ae4f), TOBN(0xbf6688f0, 0x781adaa8)}}, + {{TOBN(0xb1b40e86, 0x7469cead), TOBN(0x1904c524, 0x309fca48), + TOBN(0x9b7312af, 0x4b54bbc7), TOBN(0xbe24bf8f, 0x593affa2)}, + {TOBN(0xbe5e0790, 0xbd98764b), TOBN(0xa0f45f17, 0xa26e299e), + TOBN(0x4af0d2c2, 0x6b8fe4c7), TOBN(0xef170db1, 0x8ae8a3e6)}}, + {{TOBN(0x0e8d61a0, 0x29e0ccc1), TOBN(0xcd53e87e, 0x60ad36ca), + TOBN(0x328c6623, 0xc8173822), TOBN(0x7ee1767d, 0xa496be55)}, + {TOBN(0x89f13259, 0x648945af), TOBN(0x9e45a5fd, 0x25c8009c), + TOBN(0xaf2febd9, 0x1f61ab8c), TOBN(0x43f6bc86, 0x8a275385)}}, + {{TOBN(0x87792348, 0xf2142e79), TOBN(0x17d89259, 0xc6e6238a), + TOBN(0x7536d2f6, 0x4a839d9b), TOBN(0x1f428fce, 0x76a1fbdc)}, + {TOBN(0x1c109601, 0x0db06dfe), TOBN(0xbfc16bc1, 0x50a3a3cc), + TOBN(0xf9cbd9ec, 0x9b30f41b), TOBN(0x5b5da0d6, 0x00138cce)}}, + {{TOBN(0xec1d0a48, 0x56ef96a7), TOBN(0xb47eb848, 0x982bf842), + TOBN(0x66deae32, 0xec3f700d), TOBN(0x4e43c42c, 0xaa1181e0)}, + {TOBN(0xa1d72a31, 0xd1a4aa2a), TOBN(0x440d4668, 0xc004f3ce), + TOBN(0x0d6a2d3b, 0x45fe8a7a), TOBN(0x820e52e2, 0xfb128365)}}, + {{TOBN(0x29ac5fcf, 0x25e51b09), TOBN(0x180cd2bf, 0x2023d159), + 
TOBN(0xa9892171, 0xa1ebf90e), TOBN(0xf97c4c87, 0x7c132181)}, + {TOBN(0x9f1dc724, 0xc03dbb7e), TOBN(0xae043765, 0x018cbbe4), + TOBN(0xfb0b2a36, 0x0767d153), TOBN(0xa8e2f4d6, 0x249cbaeb)}}, + {{TOBN(0x172a5247, 0xd95ea168), TOBN(0x1758fada, 0x2970764a), + TOBN(0xac803a51, 0x1d978169), TOBN(0x299cfe2e, 0xde77e01b)}, + {TOBN(0x652a1e17, 0xb0a98927), TOBN(0x2e26e1d1, 0x20014495), + TOBN(0x7ae0af9f, 0x7175b56a), TOBN(0xc2e22a80, 0xd64b9f95)}}, + {{TOBN(0x4d0ff9fb, 0xd90a060a), TOBN(0x496a27db, 0xbaf38085), + TOBN(0x32305401, 0xda776bcf), TOBN(0xb8cdcef6, 0x725f209e)}, + {TOBN(0x61ba0f37, 0x436a0bba), TOBN(0x263fa108, 0x76860049), + TOBN(0x92beb98e, 0xda3542cf), TOBN(0xa2d4d14a, 0xd5849538)}}, + {{TOBN(0x989b9d68, 0x12e9a1bc), TOBN(0x61d9075c, 0x5f6e3268), + TOBN(0x352c6aa9, 0x99ace638), TOBN(0xde4e4a55, 0x920f43ff)}, + {TOBN(0xe5e4144a, 0xd673c017), TOBN(0x667417ae, 0x6f6e05ea), + TOBN(0x613416ae, 0xdcd1bd56), TOBN(0x5eb36201, 0x86693711)}}, + {{TOBN(0x2d7bc504, 0x3a1aa914), TOBN(0x175a1299, 0x76dc5975), + TOBN(0xe900e0f2, 0x3fc8125c), TOBN(0x569ef68c, 0x11198875)}, + {TOBN(0x9012db63, 0x63a113b4), TOBN(0xe3bd3f56, 0x98835766), + TOBN(0xa5c94a52, 0x76412dea), TOBN(0xad9e2a09, 0xaa735e5c)}}, + {{TOBN(0x405a984c, 0x508b65e9), TOBN(0xbde4a1d1, 0x6df1a0d1), + TOBN(0x1a9433a1, 0xdfba80da), TOBN(0xe9192ff9, 0x9440ad2e)}, + {TOBN(0x9f649696, 0x5099fe92), TOBN(0x25ddb65c, 0x0b27a54a), + TOBN(0x178279dd, 0xc590da61), TOBN(0x5479a999, 0xfbde681a)}}, + {{TOBN(0xd0e84e05, 0x013fe162), TOBN(0xbe11dc92, 0x632d471b), + TOBN(0xdf0b0c45, 0xfc0e089f), TOBN(0x04fb15b0, 0x4c144025)}, + {TOBN(0xa61d5fc2, 0x13c99927), TOBN(0xa033e9e0, 0x3de2eb35), + TOBN(0xf8185d5c, 0xb8dacbb4), TOBN(0x9a88e265, 0x8644549d)}}, + {{TOBN(0xf717af62, 0x54671ff6), TOBN(0x4bd4241b, 0x5fa58603), + TOBN(0x06fba40b, 0xe67773c0), TOBN(0xc1d933d2, 0x6a2847e9)}, + {TOBN(0xf4f5acf3, 0x689e2c70), TOBN(0x92aab0e7, 0x46bafd31), + TOBN(0x798d76aa, 0x3473f6e5), TOBN(0xcc6641db, 0x93141934)}}, + {{TOBN(0xcae27757, 0xd31e535e), TOBN(0x04cc43b6, 0x87c2ee11), + TOBN(0x8d1f9675, 0x2e029ffa), TOBN(0xc2150672, 0xe4cc7a2c)}, + {TOBN(0x3b03c1e0, 0x8d68b013), TOBN(0xa9d6816f, 0xedf298f3), + TOBN(0x1bfbb529, 0xa2804464), TOBN(0x95a52fae, 0x5db22125)}}, + {{TOBN(0x55b32160, 0x0e1cb64e), TOBN(0x004828f6, 0x7e7fc9fe), + TOBN(0x13394b82, 0x1bb0fb93), TOBN(0xb6293a2d, 0x35f1a920)}, + {TOBN(0xde35ef21, 0xd145d2d9), TOBN(0xbe6225b3, 0xbb8fa603), + TOBN(0x00fc8f6b, 0x32cf252d), TOBN(0xa28e52e6, 0x117cf8c2)}}, + {{TOBN(0x9d1dc89b, 0x4c371e6d), TOBN(0xcebe0675, 0x36ef0f28), + TOBN(0x5de05d09, 0xa4292f81), TOBN(0xa8303593, 0x353e3083)}, + {TOBN(0xa1715b0a, 0x7e37a9bb), TOBN(0x8c56f61e, 0x2b8faec3), + TOBN(0x52507431, 0x33c9b102), TOBN(0x0130cefc, 0xa44431f0)}}, + {{TOBN(0x56039fa0, 0xbd865cfb), TOBN(0x4b03e578, 0xbc5f1dd7), + TOBN(0x40edf2e4, 0xbabe7224), TOBN(0xc752496d, 0x3a1988f6)}, + {TOBN(0xd1572d3b, 0x564beb6b), TOBN(0x0db1d110, 0x39a1c608), + TOBN(0x568d1934, 0x16f60126), TOBN(0x05ae9668, 0xf354af33)}}, + {{TOBN(0x19de6d37, 0xc92544f2), TOBN(0xcc084353, 0xa35837d5), + TOBN(0xcbb6869c, 0x1a514ece), TOBN(0xb633e728, 0x2e1d1066)}, + {TOBN(0xf15dd69f, 0x936c581c), TOBN(0x96e7b8ce, 0x7439c4f9), + TOBN(0x5e676f48, 0x2e448a5b), TOBN(0xb2ca7d5b, 0xfd916bbb)}}, + {{TOBN(0xd55a2541, 0xf5024025), TOBN(0x47bc5769, 0xe4c2d937), + TOBN(0x7d31b92a, 0x0362189f), TOBN(0x83f3086e, 0xef7816f9)}, + {TOBN(0xf9f46d94, 0xb587579a), TOBN(0xec2d22d8, 0x30e76c5f), + TOBN(0x27d57461, 0xb000ffcf), TOBN(0xbb7e65f9, 0x364ffc2c)}}, + {{TOBN(0x7c7c9477, 0x6652a220), TOBN(0x61618f89, 0xd696c981), 
+ TOBN(0x5021701d, 0x89effff3), TOBN(0xf2c8ff8e, 0x7c314163)}, + {TOBN(0x2da413ad, 0x8efb4d3e), TOBN(0x937b5adf, 0xce176d95), + TOBN(0x22867d34, 0x2a67d51c), TOBN(0x262b9b10, 0x18eb3ac9)}}, + {{TOBN(0x4e314fe4, 0xc43ff28b), TOBN(0x76476627, 0x6a664e7a), + TOBN(0x3e90e40b, 0xb7a565c2), TOBN(0x8588993a, 0xc1acf831)}, + {TOBN(0xd7b501d6, 0x8f938829), TOBN(0x996627ee, 0x3edd7d4c), + TOBN(0x37d44a62, 0x90cd34c7), TOBN(0xa8327499, 0xf3833e8d)}}, + {{TOBN(0x2e18917d, 0x4bf50353), TOBN(0x85dd726b, 0x556765fb), + TOBN(0x54fe65d6, 0x93d5ab66), TOBN(0x3ddbaced, 0x915c25fe)}, + {TOBN(0xa799d9a4, 0x12f22e85), TOBN(0xe2a24867, 0x6d06f6bc), + TOBN(0xf4f1ee56, 0x43ca1637), TOBN(0xfda2828b, 0x61ece30a)}}, + {{TOBN(0x758c1a3e, 0xa2dee7a6), TOBN(0xdcde2f3c, 0x734b2284), + TOBN(0xaba445d2, 0x4eaba6ad), TOBN(0x35aaf668, 0x76cee0a7)}, + {TOBN(0x7e0b04a9, 0xe5aa049a), TOBN(0xe74083ad, 0x91103e84), + TOBN(0xbeb183ce, 0x40afecc3), TOBN(0x6b89de9f, 0xea043f7a)}}}, + {{{TOBN(0x0e299d23, 0xfe67ba66), TOBN(0x91450760, 0x93cf2f34), + TOBN(0xf45b5ea9, 0x97fcf913), TOBN(0x5be00843, 0x8bd7ddda)}, + {TOBN(0x358c3e05, 0xd53ff04d), TOBN(0xbf7ccdc3, 0x5de91ef7), + TOBN(0xad684dbf, 0xb69ec1a0), TOBN(0x367e7cf2, 0x801fd997)}}, + {{TOBN(0x0ca1f3b7, 0xb0dc8595), TOBN(0x27de4608, 0x9f1d9f2e), + TOBN(0x1af3bf39, 0xbadd82a7), TOBN(0x79356a79, 0x65862448)}, + {TOBN(0xc0602345, 0xf5f9a052), TOBN(0x1a8b0f89, 0x139a42f9), + TOBN(0xb53eee42, 0x844d40fc), TOBN(0x93b0bfe5, 0x4e5b6368)}}, + {{TOBN(0x5434dd02, 0xc024789c), TOBN(0x90dca9ea, 0x41b57bfc), + TOBN(0x8aa898e2, 0x243398df), TOBN(0xf607c834, 0x894a94bb)}, + {TOBN(0xbb07be97, 0xc2c99b76), TOBN(0x6576ba67, 0x18c29302), + TOBN(0x3d79efcc, 0xe703a88c), TOBN(0xf259ced7, 0xb6a0d106)}}, + {{TOBN(0x0f893a5d, 0xc8de610b), TOBN(0xe8c515fb, 0x67e223ce), + TOBN(0x7774bfa6, 0x4ead6dc5), TOBN(0x89d20f95, 0x925c728f)}, + {TOBN(0x7a1e0966, 0x098583ce), TOBN(0xa2eedb94, 0x93f2a7d7), + TOBN(0x1b282097, 0x4c304d4a), TOBN(0x0842e3da, 0xc077282d)}}, + {{TOBN(0xe4d972a3, 0x3b9e2d7b), TOBN(0x7cc60b27, 0xc48218ff), + TOBN(0x8fc70838, 0x84149d91), TOBN(0x5c04346f, 0x2f461ecc)}, + {TOBN(0xebe9fdf2, 0x614650a9), TOBN(0x5e35b537, 0xc1f666ac), + TOBN(0x645613d1, 0x88babc83), TOBN(0x88cace3a, 0xc5e1c93e)}}, + {{TOBN(0x209ca375, 0x3de92e23), TOBN(0xccb03cc8, 0x5fbbb6e3), + TOBN(0xccb90f03, 0xd7b1487e), TOBN(0xfa9c2a38, 0xc710941f)}, + {TOBN(0x756c3823, 0x6724ceed), TOBN(0x3a902258, 0x192d0323), + TOBN(0xb150e519, 0xea5e038e), TOBN(0xdcba2865, 0xc7427591)}}, + {{TOBN(0xe549237f, 0x78890732), TOBN(0xc443bef9, 0x53fcb4d9), + TOBN(0x9884d8a6, 0xeb3480d6), TOBN(0x8a35b6a1, 0x3048b186)}, + {TOBN(0xb4e44716, 0x65e9a90a), TOBN(0x45bf380d, 0x653006c0), + TOBN(0x8f3f820d, 0x4fe9ae3b), TOBN(0x244a35a0, 0x979a3b71)}}, + {{TOBN(0xa1010e9d, 0x74cd06ff), TOBN(0x9c17c7df, 0xaca3eeac), + TOBN(0x74c86cd3, 0x8063aa2b), TOBN(0x8595c4b3, 0x734614ff)}, + {TOBN(0xa3de00ca, 0x990f62cc), TOBN(0xd9bed213, 0xca0c3be5), + TOBN(0x7886078a, 0xdf8ce9f5), TOBN(0xddb27ce3, 0x5cd44444)}}, + {{TOBN(0xed374a66, 0x58926ddd), TOBN(0x138b2d49, 0x908015b8), + TOBN(0x886c6579, 0xde1f7ab8), TOBN(0x888b9aa0, 0xc3020b7a)}, + {TOBN(0xd3ec034e, 0x3a96e355), TOBN(0xba65b0b8, 0xf30fbe9a), + TOBN(0x064c8e50, 0xff21367a), TOBN(0x1f508ea4, 0x0b04b46e)}}, + {{TOBN(0x98561a49, 0x747c866c), TOBN(0xbbb1e5fe, 0x0518a062), + TOBN(0x20ff4e8b, 0xecdc3608), TOBN(0x7f55cded, 0x20184027)}, + {TOBN(0x8d73ec95, 0xf38c85f0), TOBN(0x5b589fdf, 0x8bc3b8c3), + TOBN(0xbe95dd98, 0x0f12b66f), TOBN(0xf5bd1a09, 0x0e338e01)}}, + {{TOBN(0x65163ae5, 0x5e915918), TOBN(0x6158d6d9, 
0x86f8a46b), + TOBN(0x8466b538, 0xeeebf99c), TOBN(0xca8761f6, 0xbca477ef)}, + {TOBN(0xaf3449c2, 0x9ebbc601), TOBN(0xef3b0f41, 0xe0c3ae2f), + TOBN(0xaa6c577d, 0x5de63752), TOBN(0xe9166601, 0x64682a51)}}, + {{TOBN(0x5a3097be, 0xfc15aa1e), TOBN(0x40d12548, 0xb54b0745), + TOBN(0x5bad4706, 0x519a5f12), TOBN(0xed03f717, 0xa439dee6)}, + {TOBN(0x0794bb6c, 0x4a02c499), TOBN(0xf725083d, 0xcffe71d2), + TOBN(0x2cad7519, 0x0f3adcaf), TOBN(0x7f68ea1c, 0x43729310)}}, + {{TOBN(0xe747c8c7, 0xb7ffd977), TOBN(0xec104c35, 0x80761a22), + TOBN(0x8395ebaf, 0x5a3ffb83), TOBN(0xfb3261f4, 0xe4b63db7)}, + {TOBN(0x53544960, 0xd883e544), TOBN(0x13520d70, 0x8cc2eeb8), + TOBN(0x08f6337b, 0xd3d65f99), TOBN(0x83997db2, 0x781cf95b)}}, + {{TOBN(0xce6ff106, 0x0dbd2c01), TOBN(0x4f8eea6b, 0x1f9ce934), + TOBN(0x546f7c4b, 0x0e993921), TOBN(0x6236a324, 0x5e753fc7)}, + {TOBN(0x65a41f84, 0xa16022e9), TOBN(0x0c18d878, 0x43d1dbb2), + TOBN(0x73c55640, 0x2d4cef9c), TOBN(0xa0428108, 0x70444c74)}}, + {{TOBN(0x68e4f15e, 0x9afdfb3c), TOBN(0x49a56143, 0x5bdfb6df), + TOBN(0xa9bc1bd4, 0x5f823d97), TOBN(0xbceb5970, 0xea111c2a)}, + {TOBN(0x366b455f, 0xb269bbc4), TOBN(0x7cd85e1e, 0xe9bc5d62), + TOBN(0xc743c41c, 0x4f18b086), TOBN(0xa4b40990, 0x95294fb9)}}, + {{TOBN(0x9c7c581d, 0x26ee8382), TOBN(0xcf17dcc5, 0x359d638e), + TOBN(0xee8273ab, 0xb728ae3d), TOBN(0x1d112926, 0xf821f047)}, + {TOBN(0x11498477, 0x50491a74), TOBN(0x687fa761, 0xfde0dfb9), + TOBN(0x2c258022, 0x7ea435ab), TOBN(0x6b8bdb94, 0x91ce7e3f)}}, + {{TOBN(0x4c5b5dc9, 0x3bf834aa), TOBN(0x04371819, 0x4f6c7e4b), + TOBN(0xc284e00a, 0x3736bcad), TOBN(0x0d881118, 0x21ae8f8d)}, + {TOBN(0xf9cf0f82, 0xf48c8e33), TOBN(0xa11fd075, 0xa1bf40db), + TOBN(0xdceab0de, 0xdc2733e5), TOBN(0xc560a8b5, 0x8e986bd7)}}, + {{TOBN(0x48dd1fe2, 0x3929d097), TOBN(0x3885b290, 0x92f188f1), + TOBN(0x0f2ae613, 0xda6fcdac), TOBN(0x9054303e, 0xb662a46c)}, + {TOBN(0xb6871e44, 0x0738042a), TOBN(0x98e6a977, 0xbdaf6449), + TOBN(0xd8bc0650, 0xd1c9df1b), TOBN(0xef3d6451, 0x36e098f9)}}, + {{TOBN(0x03fbae82, 0xb6d72d28), TOBN(0x77ca9db1, 0xf5d84080), + TOBN(0x8a112cff, 0xa58efc1c), TOBN(0x518d761c, 0xc564cb4a)}, + {TOBN(0x69b5740e, 0xf0d1b5ce), TOBN(0x717039cc, 0xe9eb1785), + TOBN(0x3fe29f90, 0x22f53382), TOBN(0x8e54ba56, 0x6bc7c95c)}}, + {{TOBN(0x9c806d8a, 0xf7f91d0f), TOBN(0x3b61b0f1, 0xa82a5728), + TOBN(0x4640032d, 0x94d76754), TOBN(0x273eb5de, 0x47d834c6)}, + {TOBN(0x2988abf7, 0x7b4e4d53), TOBN(0xb7ce66bf, 0xde401777), + TOBN(0x9fba6b32, 0x715071b3), TOBN(0x82413c24, 0xad3a1a98)}}, + {{TOBN(0x5b7fc8c4, 0xe0e8ad93), TOBN(0xb5679aee, 0x5fab868d), + TOBN(0xb1f9d2fa, 0x2b3946f3), TOBN(0x458897dc, 0x5685b50a)}, + {TOBN(0x1e98c930, 0x89d0caf3), TOBN(0x39564c5f, 0x78642e92), + TOBN(0x1b77729a, 0x0dbdaf18), TOBN(0xf9170722, 0x579e82e6)}}, + {{TOBN(0x680c0317, 0xe4515fa5), TOBN(0xf85cff84, 0xfb0c790f), + TOBN(0xc7a82aab, 0x6d2e0765), TOBN(0x7446bca9, 0x35c82b32)}, + {TOBN(0x5de607aa, 0x6d63184f), TOBN(0x7c1a46a8, 0x262803a6), + TOBN(0xd218313d, 0xaebe8035), TOBN(0x92113ffd, 0xc73c51f8)}}, + {{TOBN(0x4b38e083, 0x12e7e46c), TOBN(0x69d0a37a, 0x56126bd5), + TOBN(0xfb3f324b, 0x73c07e04), TOBN(0xa0c22f67, 0x8fda7267)}, + {TOBN(0x8f2c0051, 0x4d2c7d8f), TOBN(0xbc45ced3, 0xcbe2cae5), + TOBN(0xe1c6cf07, 0xa8f0f277), TOBN(0xbc392312, 0x1eb99a98)}}, + {{TOBN(0x75537b7e, 0x3cc8ac85), TOBN(0x8d725f57, 0xdd02753b), + TOBN(0xfd05ff64, 0xb737df2f), TOBN(0x55fe8712, 0xf6d2531d)}, + {TOBN(0x57ce04a9, 0x6ab6b01c), TOBN(0x69a02a89, 0x7cd93724), + TOBN(0x4f82ac35, 0xcf86699b), TOBN(0x8242d3ad, 0x9cb4b232)}}, + {{TOBN(0x713d0f65, 0xd62105e5), 
TOBN(0xbb222bfa, 0x2d29be61), + TOBN(0xf2f9a79e, 0x6cfbef09), TOBN(0xfc24d8d3, 0xd5d6782f)}, + {TOBN(0x5db77085, 0xd4129967), TOBN(0xdb81c3cc, 0xdc3c2a43), + TOBN(0x9d655fc0, 0x05d8d9a3), TOBN(0x3f5d057a, 0x54298026)}}, + {{TOBN(0x1157f56d, 0x88c54694), TOBN(0xb26baba5, 0x9b09573e), + TOBN(0x2cab03b0, 0x22adffd1), TOBN(0x60a412c8, 0xdd69f383)}, + {TOBN(0xed76e98b, 0x54b25039), TOBN(0xd4ee67d3, 0x687e714d), + TOBN(0x87739648, 0x7b00b594), TOBN(0xce419775, 0xc9ef709b)}}, + {{TOBN(0x40f76f85, 0x1c203a40), TOBN(0x30d352d6, 0xeafd8f91), + TOBN(0xaf196d3d, 0x95578dd2), TOBN(0xea4bb3d7, 0x77cc3f3d)}, + {TOBN(0x42a5bd03, 0xb98e782b), TOBN(0xac958c40, 0x0624920d), + TOBN(0xb838134c, 0xfc56fcc8), TOBN(0x86ec4ccf, 0x89572e5e)}}, + {{TOBN(0x69c43526, 0x9be47be0), TOBN(0x323b7dd8, 0xcb28fea1), + TOBN(0xfa5538ba, 0x3a6c67e5), TOBN(0xef921d70, 0x1d378e46)}, + {TOBN(0xf92961fc, 0x3c4b880e), TOBN(0x3f6f914e, 0x98940a67), + TOBN(0xa990eb0a, 0xfef0ff39), TOBN(0xa6c2920f, 0xf0eeff9c)}}, + {{TOBN(0xca804166, 0x51b8d9a3), TOBN(0x42531bc9, 0x0ffb0db1), + TOBN(0x72ce4718, 0xaa82e7ce), TOBN(0x6e199913, 0xdf574741)}, + {TOBN(0xd5f1b13d, 0xd5d36946), TOBN(0x8255dc65, 0xf68f0194), + TOBN(0xdc9df4cd, 0x8710d230), TOBN(0x3453c20f, 0x138c1988)}}, + {{TOBN(0x9af98dc0, 0x89a6ef01), TOBN(0x4dbcc3f0, 0x9857df85), + TOBN(0x34805601, 0x5c1ad924), TOBN(0x40448da5, 0xd0493046)}, + {TOBN(0xf629926d, 0x4ee343e2), TOBN(0x6343f1bd, 0x90e8a301), + TOBN(0xefc93491, 0x40815b3f), TOBN(0xf882a423, 0xde8f66fb)}}, + {{TOBN(0x3a12d5f4, 0xe7db9f57), TOBN(0x7dfba38a, 0x3c384c27), + TOBN(0x7a904bfd, 0x6fc660b1), TOBN(0xeb6c5db3, 0x2773b21c)}, + {TOBN(0xc350ee66, 0x1cdfe049), TOBN(0x9baac0ce, 0x44540f29), + TOBN(0xbc57b6ab, 0xa5ec6aad), TOBN(0x167ce8c3, 0x0a7c1baa)}}, + {{TOBN(0xb23a03a5, 0x53fb2b56), TOBN(0x6ce141e7, 0x4e057f78), + TOBN(0x796525c3, 0x89e490d9), TOBN(0x0bc95725, 0xa31a7e75)}, + {TOBN(0x1ec56791, 0x1220fd06), TOBN(0x716e3a3c, 0x408b0bd6), + TOBN(0x31cd6bf7, 0xe8ebeba9), TOBN(0xa7326ca6, 0xbee6b670)}}, + {{TOBN(0x3d9f851c, 0xcd090c43), TOBN(0x561e8f13, 0xf12c3988), + TOBN(0x50490b6a, 0x904b7be4), TOBN(0x61690ce1, 0x0410737b)}, + {TOBN(0x299e9a37, 0x0f009052), TOBN(0x258758f0, 0xf026092e), + TOBN(0x9fa255f3, 0xfdfcdc0f), TOBN(0xdbc9fb1f, 0xc0e1bcd2)}}, + {{TOBN(0x35f9dd6e, 0x24651840), TOBN(0xdca45a84, 0xa5c59abc), + TOBN(0x103d396f, 0xecca4938), TOBN(0x4532da0a, 0xb97b3f29)}, + {TOBN(0xc4135ea5, 0x1999a6bf), TOBN(0x3aa9505a, 0x5e6bf2ee), + TOBN(0xf77cef06, 0x3f5be093), TOBN(0x97d1a0f8, 0xa943152e)}}, + {{TOBN(0x2cb0ebba, 0x2e1c21dd), TOBN(0xf41b29fc, 0x2c6797c4), + TOBN(0xc6e17321, 0xb300101f), TOBN(0x4422b0e9, 0xd0d79a89)}, + {TOBN(0x49e4901c, 0x92f1bfc4), TOBN(0x06ab1f8f, 0xe1e10ed9), + TOBN(0x84d35577, 0xdb2926b8), TOBN(0xca349d39, 0x356e8ec2)}}, + {{TOBN(0x70b63d32, 0x343bf1a9), TOBN(0x8fd3bd28, 0x37d1a6b1), + TOBN(0x0454879c, 0x316865b4), TOBN(0xee959ff6, 0xc458efa2)}, + {TOBN(0x0461dcf8, 0x9706dc3f), TOBN(0x737db0e2, 0x164e4b2e), + TOBN(0x09262680, 0x2f8843c8), TOBN(0x54498bbc, 0x7745e6f6)}}, + {{TOBN(0x359473fa, 0xa29e24af), TOBN(0xfcc3c454, 0x70aa87a1), + TOBN(0xfd2c4bf5, 0x00573ace), TOBN(0xb65b514e, 0x28dd1965)}, + {TOBN(0xe46ae7cf, 0x2193e393), TOBN(0x60e9a4e1, 0xf5444d97), + TOBN(0xe7594e96, 0x00ff38ed), TOBN(0x43d84d2f, 0x0a0e0f02)}}, + {{TOBN(0x8b6db141, 0xee398a21), TOBN(0xb88a56ae, 0xe3bcc5be), + TOBN(0x0a1aa52f, 0x373460ea), TOBN(0x20da1a56, 0x160bb19b)}, + {TOBN(0xfb54999d, 0x65bf0384), TOBN(0x71a14d24, 0x5d5a180e), + TOBN(0xbc44db7b, 0x21737b04), TOBN(0xd84fcb18, 0x01dd8e92)}}, + {{TOBN(0x80de937b, 
0xfa44b479), TOBN(0x53505499, 0x5c98fd4f), + TOBN(0x1edb12ab, 0x28f08727), TOBN(0x4c58b582, 0xa5f3ef53)}, + {TOBN(0xbfb236d8, 0x8327f246), TOBN(0xc3a3bfaa, 0x4d7df320), + TOBN(0xecd96c59, 0xb96024f2), TOBN(0xfc293a53, 0x7f4e0433)}}, + {{TOBN(0x5341352b, 0x5acf6e10), TOBN(0xc50343fd, 0xafe652c3), + TOBN(0x4af3792d, 0x18577a7f), TOBN(0xe1a4c617, 0xaf16823d)}, + {TOBN(0x9b26d0cd, 0x33425d0a), TOBN(0x306399ed, 0x9b7bc47f), + TOBN(0x2a792f33, 0x706bb20b), TOBN(0x31219614, 0x98111055)}}, + {{TOBN(0x864ec064, 0x87f5d28b), TOBN(0x11392d91, 0x962277fd), + TOBN(0xb5aa7942, 0xbb6aed5f), TOBN(0x080094dc, 0x47e799d9)}, + {TOBN(0x4afa588c, 0x208ba19b), TOBN(0xd3e7570f, 0x8512f284), + TOBN(0xcbae64e6, 0x02f5799a), TOBN(0xdeebe7ef, 0x514b9492)}}, + {{TOBN(0x30300f98, 0xe5c298ff), TOBN(0x17f561be, 0x3678361f), + TOBN(0xf52ff312, 0x98cb9a16), TOBN(0x6233c3bc, 0x5562d490)}, + {TOBN(0x7bfa15a1, 0x92e3a2cb), TOBN(0x961bcfd1, 0xe6365119), + TOBN(0x3bdd29bf, 0x2c8c53b1), TOBN(0x739704df, 0x822844ba)}}, + {{TOBN(0x7dacfb58, 0x7e7b754b), TOBN(0x23360791, 0xa806c9b9), + TOBN(0xe7eb88c9, 0x23504452), TOBN(0x2983e996, 0x852c1783)}, + {TOBN(0xdd4ae529, 0x958d881d), TOBN(0x026bae03, 0x262c7b3c), + TOBN(0x3a6f9193, 0x960b52d1), TOBN(0xd0980f90, 0x92696cfb)}}, + {{TOBN(0x4c1f428c, 0xd5f30851), TOBN(0x94dfed27, 0x2a4f6630), + TOBN(0x4df53772, 0xfc5d48a4), TOBN(0xdd2d5a2f, 0x933260ce)}, + {TOBN(0x574115bd, 0xd44cc7a5), TOBN(0x4ba6b20d, 0xbd12533a), + TOBN(0x30e93cb8, 0x243057c9), TOBN(0x794c486a, 0x14de320e)}}, + {{TOBN(0xe925d4ce, 0xf21496e4), TOBN(0xf951d198, 0xec696331), + TOBN(0x9810e2de, 0x3e8d812f), TOBN(0xd0a47259, 0x389294ab)}, + {TOBN(0x513ba2b5, 0x0e3bab66), TOBN(0x462caff5, 0xabad306f), + TOBN(0xe2dc6d59, 0xaf04c49e), TOBN(0x1aeb8750, 0xe0b84b0b)}}, + {{TOBN(0xc034f12f, 0x2f7d0ca2), TOBN(0x6d2e8128, 0xe06acf2f), + TOBN(0x801f4f83, 0x21facc2f), TOBN(0xa1170c03, 0xf40ef607)}, + {TOBN(0xfe0a1d4f, 0x7805a99c), TOBN(0xbde56a36, 0xcc26aba5), + TOBN(0x5b1629d0, 0x35531f40), TOBN(0xac212c2b, 0x9afa6108)}}, + {{TOBN(0x30a06bf3, 0x15697be5), TOBN(0x6f0545dc, 0x2c63c7c1), + TOBN(0x5d8cb842, 0x7ccdadaf), TOBN(0xd52e379b, 0xac7015bb)}, + {TOBN(0xc4f56147, 0xf462c23e), TOBN(0xd44a4298, 0x46bc24b0), + TOBN(0xbc73d23a, 0xe2856d4f), TOBN(0x61cedd8c, 0x0832bcdf)}}, + {{TOBN(0x60953556, 0x99f241d7), TOBN(0xee4adbd7, 0x001a349d), + TOBN(0x0b35bf6a, 0xaa89e491), TOBN(0x7f0076f4, 0x136f7546)}, + {TOBN(0xd19a18ba, 0x9264da3d), TOBN(0x6eb2d2cd, 0x62a7a28b), + TOBN(0xcdba941f, 0x8761c971), TOBN(0x1550518b, 0xa3be4a5d)}}, + {{TOBN(0xd0e8e2f0, 0x57d0b70c), TOBN(0xeea8612e, 0xcd133ba3), + TOBN(0x814670f0, 0x44416aec), TOBN(0x424db6c3, 0x30775061)}, + {TOBN(0xd96039d1, 0x16213fd1), TOBN(0xc61e7fa5, 0x18a3478f), + TOBN(0xa805bdcc, 0xcb0c5021), TOBN(0xbdd6f3a8, 0x0cc616dd)}}, + {{TOBN(0x06009667, 0x5d97f7e2), TOBN(0x31db0fc1, 0xaf0bf4b6), + TOBN(0x23680ed4, 0x5491627a), TOBN(0xb99a3c66, 0x7d741fb1)}, + {TOBN(0xe9bb5f55, 0x36b1ff92), TOBN(0x29738577, 0x512b388d), + TOBN(0xdb8a2ce7, 0x50fcf263), TOBN(0x385346d4, 0x6c4f7b47)}}, + {{TOBN(0xbe86c5ef, 0x31631f9e), TOBN(0xbf91da21, 0x03a57a29), + TOBN(0xc3b1f796, 0x7b23f821), TOBN(0x0f7d00d2, 0x770db354)}, + {TOBN(0x8ffc6c3b, 0xd8fe79da), TOBN(0xcc5e8c40, 0xd525c996), + TOBN(0x4640991d, 0xcfff632a), TOBN(0x64d97e8c, 0x67112528)}}, + {{TOBN(0xc232d973, 0x02f1cd1e), TOBN(0xce87eacb, 0x1dd212a4), + TOBN(0x6e4c8c73, 0xe69802f7), TOBN(0x12ef0290, 0x1fffddbd)}, + {TOBN(0x941ec74e, 0x1bcea6e2), TOBN(0xd0b54024, 0x3cb92cbb), + TOBN(0x809fb9d4, 0x7e8f9d05), TOBN(0x3bf16159, 0xf2992aae)}}, + 
{{TOBN(0xad40f279, 0xf8a7a838), TOBN(0x11aea631, 0x05615660), + TOBN(0xbf52e6f1, 0xa01f6fa1), TOBN(0xef046995, 0x3dc2aec9)}, + {TOBN(0x785dbec9, 0xd8080711), TOBN(0xe1aec60a, 0x9fdedf76), + TOBN(0xece797b5, 0xfa21c126), TOBN(0xc66e898f, 0x05e52732)}}, + {{TOBN(0x39bb69c4, 0x08811fdb), TOBN(0x8bfe1ef8, 0x2fc7f082), + TOBN(0xc8e7a393, 0x174f4138), TOBN(0xfba8ad1d, 0xd58d1f98)}, + {TOBN(0xbc21d0ce, 0xbfd2fd5b), TOBN(0x0b839a82, 0x6ee60d61), + TOBN(0xaacf7658, 0xafd22253), TOBN(0xb526bed8, 0xaae396b3)}}, + {{TOBN(0xccc1bbc2, 0x38564464), TOBN(0x9e3ff947, 0x8c45bc73), + TOBN(0xcde9bca3, 0x58188a78), TOBN(0x138b8ee0, 0xd73bf8f7)}, + {TOBN(0x5c7e234c, 0x4123c489), TOBN(0x66e69368, 0xfa643297), + TOBN(0x0629eeee, 0x39a15fa3), TOBN(0x95fab881, 0xa9e2a927)}}, + {{TOBN(0xb2497007, 0xeafbb1e1), TOBN(0xd75c9ce6, 0xe75b7a93), + TOBN(0x3558352d, 0xefb68d78), TOBN(0xa2f26699, 0x223f6396)}, + {TOBN(0xeb911ecf, 0xe469b17a), TOBN(0x62545779, 0xe72d3ec2), + TOBN(0x8ea47de7, 0x82cb113f), TOBN(0xebe4b086, 0x4e1fa98d)}}, + {{TOBN(0xec2d5ed7, 0x8cdfedb1), TOBN(0xa535c077, 0xfe211a74), + TOBN(0x9678109b, 0x11d244c5), TOBN(0xf17c8bfb, 0xbe299a76)}, + {TOBN(0xb651412e, 0xfb11fbc4), TOBN(0xea0b5482, 0x94ab3f65), + TOBN(0xd8dffd95, 0x0cf78243), TOBN(0x2e719e57, 0xce0361d4)}}, + {{TOBN(0x9007f085, 0x304ddc5b), TOBN(0x095e8c6d, 0x4daba2ea), + TOBN(0x5a33cdb4, 0x3f9d28a9), TOBN(0x85b95cd8, 0xe2283003)}, + {TOBN(0xbcd6c819, 0xb9744733), TOBN(0x29c5f538, 0xfc7f5783), + TOBN(0x6c49b2fa, 0xd59038e4), TOBN(0x68349cc1, 0x3bbe1018)}}, + {{TOBN(0xcc490c1d, 0x21830ee5), TOBN(0x36f9c4ee, 0xe9bfa297), + TOBN(0x58fd7294, 0x48de1a94), TOBN(0xaadb13a8, 0x4e8f2cdc)}, + {TOBN(0x515eaaa0, 0x81313dba), TOBN(0xc76bb468, 0xc2152dd8), + TOBN(0x357f8d75, 0xa653dbf8), TOBN(0xe4d8c4d1, 0xb14ac143)}}, + {{TOBN(0xbdb8e675, 0xb055cb40), TOBN(0x898f8e7b, 0x977b5167), + TOBN(0xecc65651, 0xb82fb863), TOBN(0x56544814, 0x6d88f01f)}, + {TOBN(0xb0928e95, 0x263a75a9), TOBN(0xcfb6836f, 0x1a22fcda), + TOBN(0x651d14db, 0x3f3bd37c), TOBN(0x1d3837fb, 0xb6ad4664)}}, + {{TOBN(0x7c5fb538, 0xff4f94ab), TOBN(0x7243c712, 0x6d7fb8f2), + TOBN(0xef13d60c, 0xa85c5287), TOBN(0x18cfb7c7, 0x4bb8dd1b)}, + {TOBN(0x82f9bfe6, 0x72908219), TOBN(0x35c4592b, 0x9d5144ab), + TOBN(0x52734f37, 0x9cf4b42f), TOBN(0x6bac55e7, 0x8c60ddc4)}}, + {{TOBN(0xb5cd811e, 0x94dea0f6), TOBN(0x259ecae4, 0xe18cc1a3), + TOBN(0x6a0e836e, 0x15e660f8), TOBN(0x6c639ea6, 0x0e02bff2)}, + {TOBN(0x8721b8cb, 0x7e1026fd), TOBN(0x9e73b50b, 0x63261942), + TOBN(0xb8c70974, 0x77f01da3), TOBN(0x1839e6a6, 0x8268f57f)}}, + {{TOBN(0x571b9415, 0x5150b805), TOBN(0x1892389e, 0xf92c7097), + TOBN(0x8d69c18e, 0x4a084b95), TOBN(0x7014c512, 0xbe5b495c)}, + {TOBN(0x4780db36, 0x1b07523c), TOBN(0x2f6219ce, 0x2c1c64fa), + TOBN(0xc38b81b0, 0x602c105a), TOBN(0xab4f4f20, 0x5dc8e360)}}, + {{TOBN(0x20d3c982, 0xcf7d62d2), TOBN(0x1f36e29d, 0x23ba8150), + TOBN(0x48ae0bf0, 0x92763f9e), TOBN(0x7a527e6b, 0x1d3a7007)}, + {TOBN(0xb4a89097, 0x581a85e3), TOBN(0x1f1a520f, 0xdc158be5), + TOBN(0xf98db37d, 0x167d726e), TOBN(0x8802786e, 0x1113e862)}}}, + {{{TOBN(0xefb2149e, 0x36f09ab0), TOBN(0x03f163ca, 0x4a10bb5b), + TOBN(0xd0297045, 0x06e20998), TOBN(0x56f0af00, 0x1b5a3bab)}, + {TOBN(0x7af4cfec, 0x70880e0d), TOBN(0x7332a66f, 0xbe3d913f), + TOBN(0x32e6c84a, 0x7eceb4bd), TOBN(0xedc4a79a, 0x9c228f55)}}, + {{TOBN(0xc37c7dd0, 0xc55c4496), TOBN(0xa6a96357, 0x25bbabd2), + TOBN(0x5b7e63f2, 0xadd7f363), TOBN(0x9dce3782, 0x2e73f1df)}, + {TOBN(0xe1e5a16a, 0xb2b91f71), TOBN(0xe4489823, 0x5ba0163c), + TOBN(0xf2759c32, 0xf6e515ad), TOBN(0xa5e2f1f8, 
0x8615eecf)}}, + {{TOBN(0x74519be7, 0xabded551), TOBN(0x03d358b8, 0xc8b74410), + TOBN(0x4d00b10b, 0x0e10d9a9), TOBN(0x6392b0b1, 0x28da52b7)}, + {TOBN(0x6744a298, 0x0b75c904), TOBN(0xc305b0ae, 0xa8f7f96c), + TOBN(0x042e421d, 0x182cf932), TOBN(0xf6fc5d50, 0x9e4636ca)}}, + {{TOBN(0x795847c9, 0xd64cc78c), TOBN(0x6c50621b, 0x9b6cb27b), + TOBN(0x07099bf8, 0xdf8022ab), TOBN(0x48f862eb, 0xc04eda1d)}, + {TOBN(0xd12732ed, 0xe1603c16), TOBN(0x19a80e0f, 0x5c9a9450), + TOBN(0xe2257f54, 0xb429b4fc), TOBN(0x66d3b2c6, 0x45460515)}}, + {{TOBN(0x6ca4f87e, 0x822e37be), TOBN(0x73f237b4, 0x253bda4e), + TOBN(0xf747f3a2, 0x41190aeb), TOBN(0xf06fa36f, 0x804cf284)}, + {TOBN(0x0a6bbb6e, 0xfc621c12), TOBN(0x5d624b64, 0x40b80ec6), + TOBN(0x4b072425, 0x7ba556f3), TOBN(0x7fa0c354, 0x3e2d20a8)}}, + {{TOBN(0xe921fa31, 0xe3229d41), TOBN(0xa929c652, 0x94531bd4), + TOBN(0x84156027, 0xa6d38209), TOBN(0xf3d69f73, 0x6bdb97bd)}, + {TOBN(0x8906d19a, 0x16833631), TOBN(0x68a34c2e, 0x03d51be3), + TOBN(0xcb59583b, 0x0e511cd8), TOBN(0x99ce6bfd, 0xfdc132a8)}}, + {{TOBN(0x3facdaaa, 0xffcdb463), TOBN(0x658bbc1a, 0x34a38b08), + TOBN(0x12a801f8, 0xf1a9078d), TOBN(0x1567bcf9, 0x6ab855de)}, + {TOBN(0xe08498e0, 0x3572359b), TOBN(0xcf0353e5, 0x8659e68b), + TOBN(0xbb86e9c8, 0x7d23807c), TOBN(0xbc08728d, 0x2198e8a2)}}, + {{TOBN(0x8de2b7bc, 0x453cadd6), TOBN(0x203900a7, 0xbc0bc1f8), + TOBN(0xbcd86e47, 0xa6abd3af), TOBN(0x911cac12, 0x8502effb)}, + {TOBN(0x2d550242, 0xec965469), TOBN(0x0e9f7692, 0x29e0017e), + TOBN(0x633f078f, 0x65979885), TOBN(0xfb87d449, 0x4cf751ef)}}, + {{TOBN(0xe1790e4b, 0xfc25419a), TOBN(0x36467203, 0x4bff3cfd), + TOBN(0xc8db6386, 0x25b6e83f), TOBN(0x6cc69f23, 0x6cad6fd2)}, + {TOBN(0x0219e45a, 0x6bc68bb9), TOBN(0xe43d79b6, 0x297f7334), + TOBN(0x7d445368, 0x465dc97c), TOBN(0x4b9eea32, 0x2a0b949a)}}, + {{TOBN(0x1b96c6ba, 0x6102d021), TOBN(0xeaafac78, 0x2f4461ea), + TOBN(0xd4b85c41, 0xc49f19a8), TOBN(0x275c28e4, 0xcf538875)}, + {TOBN(0x35451a9d, 0xdd2e54e0), TOBN(0x6991adb5, 0x0605618b), + TOBN(0x5b8b4bcd, 0x7b36cd24), TOBN(0x372a4f8c, 0x56f37216)}}, + {{TOBN(0xc890bd73, 0xa6a5da60), TOBN(0x6f083da0, 0xdc4c9ff0), + TOBN(0xf4e14d94, 0xf0536e57), TOBN(0xf9ee1eda, 0xaaec8243)}, + {TOBN(0x571241ec, 0x8bdcf8e7), TOBN(0xa5db8271, 0x0b041e26), + TOBN(0x9a0b9a99, 0xe3fff040), TOBN(0xcaaf21dd, 0x7c271202)}}, + {{TOBN(0xb4e2b2e1, 0x4f0dd2e8), TOBN(0xe77e7c4f, 0x0a377ac7), + TOBN(0x69202c3f, 0x0d7a2198), TOBN(0xf759b7ff, 0x28200eb8)}, + {TOBN(0xc87526ed, 0xdcfe314e), TOBN(0xeb84c524, 0x53d5cf99), + TOBN(0xb1b52ace, 0x515138b6), TOBN(0x5aa7ff8c, 0x23fca3f4)}}, + {{TOBN(0xff0b13c3, 0xb9791a26), TOBN(0x960022da, 0xcdd58b16), + TOBN(0xdbd55c92, 0x57aad2de), TOBN(0x3baaaaa3, 0xf30fe619)}, + {TOBN(0x9a4b2346, 0x0d881efd), TOBN(0x506416c0, 0x46325e2a), + TOBN(0x91381e76, 0x035c18d4), TOBN(0xb3bb68be, 0xf27817b0)}}, + {{TOBN(0x15bfb8bf, 0x5116f937), TOBN(0x7c64a586, 0xc1268943), + TOBN(0x71e25cc3, 0x8419a2c8), TOBN(0x9fd6b0c4, 0x8335f463)}, + {TOBN(0x4bf0ba3c, 0xe8ee0e0e), TOBN(0x6f6fba60, 0x298c21fa), + TOBN(0x57d57b39, 0xae66bee0), TOBN(0x292d5130, 0x22672544)}}, + {{TOBN(0xf451105d, 0xbab093b3), TOBN(0x012f59b9, 0x02839986), + TOBN(0x8a915802, 0x3474a89c), TOBN(0x048c919c, 0x2de03e97)}, + {TOBN(0xc476a2b5, 0x91071cd5), TOBN(0x791ed89a, 0x034970a5), + TOBN(0x89bd9042, 0xe1b7994b), TOBN(0x8eaf5179, 0xa1057ffd)}}, + {{TOBN(0x6066e2a2, 0xd551ee10), TOBN(0x87a8f1d8, 0x727e09a6), + TOBN(0x00d08bab, 0x2c01148d), TOBN(0x6da8e4f1, 0x424f33fe)}, + {TOBN(0x466d17f0, 0xcf9a4e71), TOBN(0xff502010, 0x3bf5cb19), + TOBN(0xdccf97d8, 0xd062ecc0), 
TOBN(0x80c0d9af, 0x81d80ac4)}}, + {{TOBN(0xe87771d8, 0x033f2876), TOBN(0xb0186ec6, 0x7d5cc3db), + TOBN(0x58e8bb80, 0x3bc9bc1d), TOBN(0x4d1395cc, 0x6f6ef60e)}, + {TOBN(0xa73c62d6, 0x186244a0), TOBN(0x918e5f23, 0x110a5b53), + TOBN(0xed4878ca, 0x741b7eab), TOBN(0x3038d71a, 0xdbe03e51)}}, + {{TOBN(0x840204b7, 0xa93c3246), TOBN(0x21ab6069, 0xa0b9b4cd), + TOBN(0xf5fa6e2b, 0xb1d64218), TOBN(0x1de6ad0e, 0xf3d56191)}, + {TOBN(0x570aaa88, 0xff1929c7), TOBN(0xc6df4c6b, 0x640e87b5), + TOBN(0xde8a74f2, 0xc65f0ccc), TOBN(0x8b972fd5, 0xe6f6cc01)}}, + {{TOBN(0x3fff36b6, 0x0b846531), TOBN(0xba7e45e6, 0x10a5e475), + TOBN(0x84a1d10e, 0x4145b6c5), TOBN(0xf1f7f91a, 0x5e046d9d)}, + {TOBN(0x0317a692, 0x44de90d7), TOBN(0x951a1d4a, 0xf199c15e), + TOBN(0x91f78046, 0xc9d73deb), TOBN(0x74c82828, 0xfab8224f)}}, + {{TOBN(0xaa6778fc, 0xe7560b90), TOBN(0xb4073e61, 0xa7e824ce), + TOBN(0xff0d693c, 0xd642eba8), TOBN(0x7ce2e57a, 0x5dccef38)}, + {TOBN(0x89c2c789, 0x1df1ad46), TOBN(0x83a06922, 0x098346fd), + TOBN(0x2d715d72, 0xda2fc177), TOBN(0x7b6dd71d, 0x85b6cf1d)}}, + {{TOBN(0xc60a6d0a, 0x73fa9cb0), TOBN(0xedd3992e, 0x328bf5a9), + TOBN(0xc380ddd0, 0x832c8c82), TOBN(0xd182d410, 0xa2a0bf50)}, + {TOBN(0x7d9d7438, 0xd9a528db), TOBN(0xe8b1a0e9, 0xcaf53994), + TOBN(0xddd6e5fe, 0x0e19987c), TOBN(0xacb8df03, 0x190b059d)}}, + {{TOBN(0x53703a32, 0x8300129f), TOBN(0x1f637662, 0x68c43bfd), + TOBN(0xbcbd1913, 0x00e54051), TOBN(0x812fcc62, 0x7bf5a8c5)}, + {TOBN(0x3f969d5f, 0x29fb85da), TOBN(0x72f4e00a, 0x694759e8), + TOBN(0x426b6e52, 0x790726b7), TOBN(0x617bbc87, 0x3bdbb209)}}, + {{TOBN(0x511f8bb9, 0x97aee317), TOBN(0x812a4096, 0xe81536a8), + TOBN(0x137dfe59, 0x3ac09b9b), TOBN(0x0682238f, 0xba8c9a7a)}, + {TOBN(0x7072ead6, 0xaeccb4bd), TOBN(0x6a34e9aa, 0x692ba633), + TOBN(0xc82eaec2, 0x6fff9d33), TOBN(0xfb753512, 0x1d4d2b62)}}, + {{TOBN(0x1a0445ff, 0x1d7aadab), TOBN(0x65d38260, 0xd5f6a67c), + TOBN(0x6e62fb08, 0x91cfb26f), TOBN(0xef1e0fa5, 0x5c7d91d6)}, + {TOBN(0x47e7c7ba, 0x33db72cd), TOBN(0x017cbc09, 0xfa7c74b2), + TOBN(0x3c931590, 0xf50a503c), TOBN(0xcac54f60, 0x616baa42)}}, + {{TOBN(0x9b6cd380, 0xb2369f0f), TOBN(0x97d3a70d, 0x23c76151), + TOBN(0x5f9dd6fc, 0x9862a9c6), TOBN(0x044c4ab2, 0x12312f51)}, + {TOBN(0x035ea0fd, 0x834a2ddc), TOBN(0x49e6b862, 0xcc7b826d), + TOBN(0xb03d6883, 0x62fce490), TOBN(0x62f2497a, 0xb37e36e9)}}, + {{TOBN(0x04b005b6, 0xc6458293), TOBN(0x36bb5276, 0xe8d10af7), + TOBN(0xacf2dc13, 0x8ee617b8), TOBN(0x470d2d35, 0xb004b3d4)}, + {TOBN(0x06790832, 0xfeeb1b77), TOBN(0x2bb75c39, 0x85657f9c), + TOBN(0xd70bd4ed, 0xc0f60004), TOBN(0xfe797ecc, 0x219b018b)}}, + {{TOBN(0x9b5bec2a, 0x753aebcc), TOBN(0xdaf9f3dc, 0xc939eca5), + TOBN(0xd6bc6833, 0xd095ad09), TOBN(0x98abdd51, 0xdaa4d2fc)}, + {TOBN(0xd9840a31, 0x8d168be5), TOBN(0xcf7c10e0, 0x2325a23c), + TOBN(0xa5c02aa0, 0x7e6ecfaf), TOBN(0x2462e7e6, 0xb5bfdf18)}}, + {{TOBN(0xab2d8a8b, 0xa0cc3f12), TOBN(0x68dd485d, 0xbc672a29), + TOBN(0x72039752, 0x596f2cd3), TOBN(0x5d3eea67, 0xa0cf3d8d)}, + {TOBN(0x810a1a81, 0xe6602671), TOBN(0x8f144a40, 0x14026c0c), + TOBN(0xbc753a6d, 0x76b50f85), TOBN(0xc4dc21e8, 0x645cd4a4)}}, + {{TOBN(0xc5262dea, 0x521d0378), TOBN(0x802b8e0e, 0x05011c6f), + TOBN(0x1ba19cbb, 0x0b4c19ea), TOBN(0x21db64b5, 0xebf0aaec)}, + {TOBN(0x1f394ee9, 0x70342f9d), TOBN(0x93a10aee, 0x1bc44a14), + TOBN(0xa7eed31b, 0x3efd0baa), TOBN(0x6e7c824e, 0x1d154e65)}}, + {{TOBN(0xee23fa81, 0x9966e7ee), TOBN(0x64ec4aa8, 0x05b7920d), + TOBN(0x2d44462d, 0x2d90aad4), TOBN(0xf44dd195, 0xdf277ad5)}, + {TOBN(0x8d6471f1, 0xbb46b6a1), TOBN(0x1e65d313, 0xfd885090), + TOBN(0x33a800f5, 
0x13a977b4), TOBN(0xaca9d721, 0x0797e1ef)}}, + {{TOBN(0x9a5a85a0, 0xfcff6a17), TOBN(0x9970a3f3, 0x1eca7cee), + TOBN(0xbb9f0d6b, 0xc9504be3), TOBN(0xe0c504be, 0xadd24ee2)}, + {TOBN(0x7e09d956, 0x77fcc2f4), TOBN(0xef1a5227, 0x65bb5fc4), + TOBN(0x145d4fb1, 0x8b9286aa), TOBN(0x66fd0c5d, 0x6649028b)}}, + {{TOBN(0x98857ceb, 0x1bf4581c), TOBN(0xe635e186, 0xaca7b166), + TOBN(0x278ddd22, 0x659722ac), TOBN(0xa0903c4c, 0x1db68007)}, + {TOBN(0x366e4589, 0x48f21402), TOBN(0x31b49c14, 0xb96abda2), + TOBN(0x329c4b09, 0xe0403190), TOBN(0x97197ca3, 0xd29f43fe)}}, + {{TOBN(0x8073dd1e, 0x274983d8), TOBN(0xda1a3bde, 0x55717c8f), + TOBN(0xfd3d4da2, 0x0361f9d1), TOBN(0x1332d081, 0x4c7de1ce)}, + {TOBN(0x9b7ef7a3, 0xaa6d0e10), TOBN(0x17db2e73, 0xf54f1c4a), + TOBN(0xaf3dffae, 0x4cd35567), TOBN(0xaaa2f406, 0xe56f4e71)}}, + {{TOBN(0x8966759e, 0x7ace3fc7), TOBN(0x9594eacf, 0x45a8d8c6), + TOBN(0x8de3bd8b, 0x91834e0e), TOBN(0xafe4ca53, 0x548c0421)}, + {TOBN(0xfdd7e856, 0xe6ee81c6), TOBN(0x8f671beb, 0x6b891a3a), + TOBN(0xf7a58f2b, 0xfae63829), TOBN(0x9ab186fb, 0x9c11ac9f)}}, + {{TOBN(0x8d6eb369, 0x10b5be76), TOBN(0x046b7739, 0xfb040bcd), + TOBN(0xccb4529f, 0xcb73de88), TOBN(0x1df0fefc, 0xcf26be03)}, + {TOBN(0xad7757a6, 0xbcfcd027), TOBN(0xa8786c75, 0xbb3165ca), + TOBN(0xe9db1e34, 0x7e99a4d9), TOBN(0x99ee86df, 0xb06c504b)}}, + {{TOBN(0x5b7c2ddd, 0xc15c9f0a), TOBN(0xdf87a734, 0x4295989e), + TOBN(0x59ece47c, 0x03d08fda), TOBN(0xb074d3dd, 0xad5fc702)}, + {TOBN(0x20407903, 0x51a03776), TOBN(0x2bb1f77b, 0x2a608007), + TOBN(0x25c58f4f, 0xe1153185), TOBN(0xe6df62f6, 0x766e6447)}}, + {{TOBN(0xefb3d1be, 0xed51275a), TOBN(0x5de47dc7, 0x2f0f483f), + TOBN(0x7932d98e, 0x97c2bedf), TOBN(0xd5c11927, 0x0219f8a1)}, + {TOBN(0x9d751200, 0xa73a294e), TOBN(0x5f88434a, 0x9dc20172), + TOBN(0xd28d9fd3, 0xa26f506a), TOBN(0xa890cd31, 0x9d1dcd48)}}, + {{TOBN(0x0aebaec1, 0x70f4d3b4), TOBN(0xfd1a1369, 0x0ffc8d00), + TOBN(0xb9d9c240, 0x57d57838), TOBN(0x45929d26, 0x68bac361)}, + {TOBN(0x5a2cd060, 0x25b15ca6), TOBN(0x4b3c83e1, 0x6e474446), + TOBN(0x1aac7578, 0xee1e5134), TOBN(0xa418f5d6, 0xc91e2f41)}}, + {{TOBN(0x6936fc8a, 0x213ed68b), TOBN(0x860ae7ed, 0x510a5224), + TOBN(0x63660335, 0xdef09b53), TOBN(0x641b2897, 0xcd79c98d)}, + {TOBN(0x29bd38e1, 0x01110f35), TOBN(0x79c26f42, 0x648b1937), + TOBN(0x64dae519, 0x9d9164f4), TOBN(0xd85a2310, 0x0265c273)}}, + {{TOBN(0x7173dd5d, 0x4b07e2b1), TOBN(0xd144c4cb, 0x8d9ea221), + TOBN(0xe8b04ea4, 0x1105ab14), TOBN(0x92dda542, 0xfe80d8f1)}, + {TOBN(0xe9982fa8, 0xcf03dce6), TOBN(0x8b5ea965, 0x1a22cffc), + TOBN(0xf7f4ea7f, 0x3fad88c4), TOBN(0x62db773e, 0x6a5ba95c)}}, + {{TOBN(0xd20f02fb, 0x93f24567), TOBN(0xfd46c69a, 0x315257ca), + TOBN(0x0ac74cc7, 0x8bcab987), TOBN(0x46f31c01, 0x5ceca2f5)}, + {TOBN(0x40aedb59, 0x888b219e), TOBN(0xe50ecc37, 0xe1fccd02), + TOBN(0x1bcd9dad, 0x911f816c), TOBN(0x583cc1ec, 0x8db9b00c)}}, + {{TOBN(0xf3cd2e66, 0xa483bf11), TOBN(0xfa08a6f5, 0xb1b2c169), + TOBN(0xf375e245, 0x4be9fa28), TOBN(0x99a7ffec, 0x5b6d011f)}, + {TOBN(0x6a3ebddb, 0xc4ae62da), TOBN(0x6cea00ae, 0x374aef5d), + TOBN(0xab5fb98d, 0x9d4d05bc), TOBN(0x7cba1423, 0xd560f252)}}, + {{TOBN(0x49b2cc21, 0x208490de), TOBN(0x1ca66ec3, 0xbcfb2879), + TOBN(0x7f1166b7, 0x1b6fb16f), TOBN(0xfff63e08, 0x65fe5db3)}, + {TOBN(0xb8345abe, 0x8b2610be), TOBN(0xb732ed80, 0x39de3df4), + TOBN(0x0e24ed50, 0x211c32b4), TOBN(0xd10d8a69, 0x848ff27d)}}, + {{TOBN(0xc1074398, 0xed4de248), TOBN(0xd7cedace, 0x10488927), + TOBN(0xa4aa6bf8, 0x85673e13), TOBN(0xb46bae91, 0x6daf30af)}, + {TOBN(0x07088472, 0xfcef7ad8), TOBN(0x61151608, 0xd4b35e97), + 
TOBN(0xbcfe8f26, 0xdde29986), TOBN(0xeb84c4c7, 0xd5a34c79)}}, + {{TOBN(0xc1eec55c, 0x164e1214), TOBN(0x891be86d, 0xa147bb03), + TOBN(0x9fab4d10, 0x0ba96835), TOBN(0xbf01e9b8, 0xa5c1ae9f)}, + {TOBN(0x6b4de139, 0xb186ebc0), TOBN(0xd5c74c26, 0x85b91bca), + TOBN(0x5086a99c, 0xc2d93854), TOBN(0xeed62a7b, 0xa7a9dfbc)}}, + {{TOBN(0x8778ed6f, 0x76b7618a), TOBN(0xbff750a5, 0x03b66062), + TOBN(0x4cb7be22, 0xb65186db), TOBN(0x369dfbf0, 0xcc3a6d13)}, + {TOBN(0xc7dab26c, 0x7191a321), TOBN(0x9edac3f9, 0x40ed718e), + TOBN(0xbc142b36, 0xd0cfd183), TOBN(0xc8af82f6, 0x7c991693)}}, + {{TOBN(0xb3d1e4d8, 0x97ce0b2a), TOBN(0xe6d7c87f, 0xc3a55cdf), + TOBN(0x35846b95, 0x68b81afe), TOBN(0x018d12af, 0xd3c239d8)}, + {TOBN(0x2b2c6208, 0x01206e15), TOBN(0xe0e42453, 0xa3b882c6), + TOBN(0x854470a3, 0xa50162d5), TOBN(0x08157478, 0x7017a62a)}}, + {{TOBN(0x18bd3fb4, 0x820357c7), TOBN(0x992039ae, 0x6f1458ad), + TOBN(0x9a1df3c5, 0x25b44aa1), TOBN(0x2d780357, 0xed3d5281)}, + {TOBN(0x58cf7e4d, 0xc77ad4d4), TOBN(0xd49a7998, 0xf9df4fc4), + TOBN(0x4465a8b5, 0x1d71205e), TOBN(0xa0ee0ea6, 0x649254aa)}}, + {{TOBN(0x4b5eeecf, 0xab7bd771), TOBN(0x6c873073, 0x35c262b9), + TOBN(0xdc5bd648, 0x3c9d61e7), TOBN(0x233d6d54, 0x321460d2)}, + {TOBN(0xd20c5626, 0xfc195bcc), TOBN(0x25445958, 0x04d78b63), + TOBN(0xe03fcb3d, 0x17ec8ef3), TOBN(0x54b690d1, 0x46b8f781)}}, + {{TOBN(0x82fa2c8a, 0x21230646), TOBN(0xf51aabb9, 0x084f418c), + TOBN(0xff4fbec1, 0x1a30ba43), TOBN(0x6a5acf73, 0x743c9df7)}, + {TOBN(0x1da2b357, 0xd635b4d5), TOBN(0xc3de68dd, 0xecd5c1da), + TOBN(0xa689080b, 0xd61af0dd), TOBN(0xdea5938a, 0xd665bf99)}}, + {{TOBN(0x0231d71a, 0xfe637294), TOBN(0x01968aa6, 0xa5a81cd8), + TOBN(0x11252d50, 0x048e63b5), TOBN(0xc446bc52, 0x6ca007e9)}, + {TOBN(0xef8c50a6, 0x96d6134b), TOBN(0x9361fbf5, 0x9e09a05c), + TOBN(0xf17f85a6, 0xdca3291a), TOBN(0xb178d548, 0xff251a21)}}, + {{TOBN(0x87f6374b, 0xa4df3915), TOBN(0x566ce1bf, 0x2fd5d608), + TOBN(0x425cba4d, 0x7de35102), TOBN(0x6b745f8f, 0x58c5d5e2)}, + {TOBN(0x88402af6, 0x63122edf), TOBN(0x3190f9ed, 0x3b989a89), + TOBN(0x4ad3d387, 0xebba3156), TOBN(0xef385ad9, 0xc7c469a5)}}, + {{TOBN(0xb08281de, 0x3f642c29), TOBN(0x20be0888, 0x910ffb88), + TOBN(0xf353dd4a, 0xd5292546), TOBN(0x3f1627de, 0x8377a262)}, + {TOBN(0xa5faa013, 0xeefcd638), TOBN(0x8f3bf626, 0x74cc77c3), + TOBN(0x32618f65, 0xa348f55e), TOBN(0x5787c0dc, 0x9fefeb9e)}}, + {{TOBN(0xf1673aa2, 0xd9a23e44), TOBN(0x88dfa993, 0x4e10690d), + TOBN(0x1ced1b36, 0x2bf91108), TOBN(0x9193ceca, 0x3af48649)}, + {TOBN(0xfb34327d, 0x2d738fc5), TOBN(0x6697b037, 0x975fee6c), + TOBN(0x2f485da0, 0xc04079a5), TOBN(0x2cdf5735, 0x2feaa1ac)}}, + {{TOBN(0x76944420, 0xbd55659e), TOBN(0x7973e32b, 0x4376090c), + TOBN(0x86bb4fe1, 0x163b591a), TOBN(0x10441aed, 0xc196f0ca)}, + {TOBN(0x3b431f4a, 0x045ad915), TOBN(0x6c11b437, 0xa4afacb1), + TOBN(0x30b0c7db, 0x71fdbbd8), TOBN(0xb642931f, 0xeda65acd)}}, + {{TOBN(0x4baae6e8, 0x9c92b235), TOBN(0xa73bbd0e, 0x6b3993a1), + TOBN(0xd06d60ec, 0x693dd031), TOBN(0x03cab91b, 0x7156881c)}, + {TOBN(0xd615862f, 0x1db3574b), TOBN(0x485b0185, 0x64bb061a), + TOBN(0x27434988, 0xa0181e06), TOBN(0x2cd61ad4, 0xc1c0c757)}}, + {{TOBN(0x3effed5a, 0x2ff9f403), TOBN(0x8dc98d8b, 0x62239029), + TOBN(0x2206021e, 0x1f17b70d), TOBN(0xafbec0ca, 0xbf510015)}, + {TOBN(0x9fed7164, 0x80130dfa), TOBN(0x306dc2b5, 0x8a02dcf5), + TOBN(0x48f06620, 0xfeb10fc0), TOBN(0x78d1e1d5, 0x5a57cf51)}}, + {{TOBN(0xadef8c5a, 0x192ef710), TOBN(0x88afbd4b, 0x3b7431f9), + TOBN(0x7e1f7407, 0x64250c9e), TOBN(0x6e31318d, 0xb58bec07)}, + {TOBN(0xfd4fc4b8, 0x24f89b4e), TOBN(0x65a5dd88, 0x48c36a2a), 
+ TOBN(0x4f1eccff, 0xf024baa7), TOBN(0x22a21cf2, 0xcba94650)}}, + {{TOBN(0x95d29dee, 0x42a554f7), TOBN(0x828983a5, 0x002ec4ba), + TOBN(0x8112a1f7, 0x8badb73d), TOBN(0x79ea8897, 0xa27c1839)}, + {TOBN(0x8969a5a7, 0xd065fd83), TOBN(0xf49af791, 0xb262a0bc), + TOBN(0xfcdea8b6, 0xaf2b5127), TOBN(0x10e913e1, 0x564c2dbc)}}, + {{TOBN(0x51239d14, 0xbc21ef51), TOBN(0xe51c3ceb, 0x4ce57292), + TOBN(0x795ff068, 0x47bbcc3b), TOBN(0x86b46e1e, 0xbd7e11e6)}, + {TOBN(0x0ea6ba23, 0x80041ef4), TOBN(0xd72fe505, 0x6262342e), + TOBN(0x8abc6dfd, 0x31d294d4), TOBN(0xbbe017a2, 0x1278c2c9)}}, + {{TOBN(0xb1fcfa09, 0xb389328a), TOBN(0x322fbc62, 0xd01771b5), + TOBN(0x04c0d063, 0x60b045bf), TOBN(0xdb652edc, 0x10e52d01)}, + {TOBN(0x50ef932c, 0x03ec6627), TOBN(0xde1b3b2d, 0xc1ee50e3), + TOBN(0x5ab7bdc5, 0xdc37a90d), TOBN(0xfea67213, 0x31e33a96)}}, + {{TOBN(0x6482b5cb, 0x4f2999aa), TOBN(0x38476cc6, 0xb8cbf0dd), + TOBN(0x93ebfacb, 0x173405bb), TOBN(0x15cdafe7, 0xe52369ec)}, + {TOBN(0xd42d5ba4, 0xd935b7db), TOBN(0x648b6004, 0x1c99a4cd), + TOBN(0x785101bd, 0xa3b5545b), TOBN(0x4bf2c38a, 0x9dd67faf)}}, + {{TOBN(0xb1aadc63, 0x4442449c), TOBN(0xe0e9921a, 0x33ad4fb8), + TOBN(0x5c552313, 0xaa686d82), TOBN(0xdee635fa, 0x465d866c)}, + {TOBN(0xbc3c224a, 0x18ee6e8a), TOBN(0xeed748a6, 0xed42e02f), + TOBN(0xe70f930a, 0xd474cd08), TOBN(0x774ea6ec, 0xfff24adf)}}, + {{TOBN(0x03e2de1c, 0xf3480d4a), TOBN(0xf0d8edc7, 0xbc8acf1a), + TOBN(0xf23e3303, 0x68295a9c), TOBN(0xfadd5f68, 0xc546a97d)}, + {TOBN(0x895597ad, 0x96f8acb1), TOBN(0xbddd49d5, 0x671bdae2), + TOBN(0x16fcd528, 0x21dd43f4), TOBN(0xa5a45412, 0x6619141a)}}}, + {{{TOBN(0x8ce9b6bf, 0xc360e25a), TOBN(0xe6425195, 0x075a1a78), + TOBN(0x9dc756a8, 0x481732f4), TOBN(0x83c0440f, 0x5432b57a)}, + {TOBN(0xc670b3f1, 0xd720281f), TOBN(0x2205910e, 0xd135e051), + TOBN(0xded14b0e, 0xdb052be7), TOBN(0x697b3d27, 0xc568ea39)}}, + {{TOBN(0x2e599b9a, 0xfb3ff9ed), TOBN(0x28c2e0ab, 0x17f6515c), + TOBN(0x1cbee4fd, 0x474da449), TOBN(0x071279a4, 0x4f364452)}, + {TOBN(0x97abff66, 0x01fbe855), TOBN(0x3ee394e8, 0x5fda51c4), + TOBN(0x190385f6, 0x67597c0b), TOBN(0x6e9fccc6, 0xa27ee34b)}}, + {{TOBN(0x0b89de93, 0x14092ebb), TOBN(0xf17256bd, 0x428e240c), + TOBN(0xcf89a7f3, 0x93d2f064), TOBN(0x4f57841e, 0xe1ed3b14)}, + {TOBN(0x4ee14405, 0xe708d855), TOBN(0x856aae72, 0x03f1c3d0), + TOBN(0xc8e5424f, 0xbdd7eed5), TOBN(0x3333e4ef, 0x73ab4270)}}, + {{TOBN(0x3bc77ade, 0xdda492f8), TOBN(0xc11a3aea, 0x78297205), + TOBN(0x5e89a3e7, 0x34931b4c), TOBN(0x17512e2e, 0x9f5694bb)}, + {TOBN(0x5dc349f3, 0x177bf8b6), TOBN(0x232ea4ba, 0x08c7ff3e), + TOBN(0x9c4f9d16, 0xf511145d), TOBN(0xccf109a3, 0x33b379c3)}}, + {{TOBN(0xe75e7a88, 0xa1f25897), TOBN(0x7ac6961f, 0xa1b5d4d8), + TOBN(0xe3e10773, 0x08f3ed5c), TOBN(0x208a54ec, 0x0a892dfb)}, + {TOBN(0xbe826e19, 0x78660710), TOBN(0x0cf70a97, 0x237df2c8), + TOBN(0x418a7340, 0xed704da5), TOBN(0xa3eeb9a9, 0x08ca33fd)}}, + {{TOBN(0x49d96233, 0x169bca96), TOBN(0x04d286d4, 0x2da6aafb), + TOBN(0xc09606ec, 0xa0c2fa94), TOBN(0x8869d0d5, 0x23ff0fb3)}, + {TOBN(0xa99937e5, 0xd0150d65), TOBN(0xa92e2503, 0x240c14c9), + TOBN(0x656bf945, 0x108e2d49), TOBN(0x152a733a, 0xa2f59e2b)}}, + {{TOBN(0xb4323d58, 0x8434a920), TOBN(0xc0af8e93, 0x622103c5), + TOBN(0x667518ef, 0x938dbf9a), TOBN(0xa1843073, 0x83a9cdf2)}, + {TOBN(0x350a94aa, 0x5447ab80), TOBN(0xe5e5a325, 0xc75a3d61), + TOBN(0x74ba507f, 0x68411a9e), TOBN(0x10581fc1, 0x594f70c5)}}, + {{TOBN(0x60e28570, 0x80eb24a9), TOBN(0x7bedfb4d, 0x488e0cfd), + TOBN(0x721ebbd7, 0xc259cdb8), TOBN(0x0b0da855, 0xbc6390a9)}, + {TOBN(0x2b4d04db, 0xde314c70), TOBN(0xcdbf1fbc, 
0x6c32e846), + TOBN(0x33833eab, 0xb162fc9e), TOBN(0x9939b48b, 0xb0dd3ab7)}}, + {{TOBN(0x5aaa98a7, 0xcb0c9c8c), TOBN(0x75105f30, 0x81c4375c), + TOBN(0xceee5057, 0x5ef1c90f), TOBN(0xb31e065f, 0xc23a17bf)}, + {TOBN(0x5364d275, 0xd4b6d45a), TOBN(0xd363f3ad, 0x62ec8996), + TOBN(0xb5d21239, 0x4391c65b), TOBN(0x84564765, 0xebb41b47)}}, + {{TOBN(0x20d18ecc, 0x37107c78), TOBN(0xacff3b6b, 0x570c2a66), + TOBN(0x22f975d9, 0x9bd0d845), TOBN(0xef0a0c46, 0xba178fa0)}, + {TOBN(0x1a419651, 0x76b6028e), TOBN(0xc49ec674, 0x248612d4), + TOBN(0x5b6ac4f2, 0x7338af55), TOBN(0x06145e62, 0x7bee5a36)}}, + {{TOBN(0x33e95d07, 0xe75746b5), TOBN(0x1c1e1f6d, 0xc40c78be), + TOBN(0x967833ef, 0x222ff8e2), TOBN(0x4bedcf6a, 0xb49180ad)}, + {TOBN(0x6b37e9c1, 0x3d7a4c8a), TOBN(0x2748887c, 0x6ddfe760), + TOBN(0xf7055123, 0xaa3a5bbc), TOBN(0x954ff225, 0x7bbb8e74)}}, + {{TOBN(0xc42b8ab1, 0x97c3dfb9), TOBN(0x55a549b0, 0xcf168154), + TOBN(0xad6748e7, 0xc1b50692), TOBN(0x2775780f, 0x6fc5cbcb)}, + {TOBN(0x4eab80b8, 0xe1c9d7c8), TOBN(0x8c69dae1, 0x3fdbcd56), + TOBN(0x47e6b4fb, 0x9969eace), TOBN(0x002f1085, 0xa705cb5a)}}, + {{TOBN(0x4e23ca44, 0x6d3fea55), TOBN(0xb4ae9c86, 0xf4810568), + TOBN(0x47bfb91b, 0x2a62f27d), TOBN(0x60deb4c9, 0xd9bac28c)}, + {TOBN(0xa892d894, 0x7de6c34c), TOBN(0x4ee68259, 0x4494587d), + TOBN(0x914ee14e, 0x1a3f8a5b), TOBN(0xbb113eaa, 0x28700385)}}, + {{TOBN(0x81ca03b9, 0x2115b4c9), TOBN(0x7c163d38, 0x8908cad1), + TOBN(0xc912a118, 0xaa18179a), TOBN(0xe09ed750, 0x886e3081)}, + {TOBN(0xa676e3fa, 0x26f516ca), TOBN(0x753cacf7, 0x8e732f91), + TOBN(0x51592aea, 0x833da8b4), TOBN(0xc626f42f, 0x4cbea8aa)}}, + {{TOBN(0xef9dc899, 0xa7b56eaf), TOBN(0x00c0e52c, 0x34ef7316), + TOBN(0x5b1e4e24, 0xfe818a86), TOBN(0x9d31e20d, 0xc538be47)}, + {TOBN(0x22eb932d, 0x3ed68974), TOBN(0xe44bbc08, 0x7c4e87c4), + TOBN(0x4121086e, 0x0dde9aef), TOBN(0x8e6b9cff, 0x134f4345)}}, + {{TOBN(0x96892c1f, 0x711b0eb9), TOBN(0xb905f2c8, 0x780ab954), + TOBN(0xace26309, 0xa20792db), TOBN(0xec8ac9b3, 0x0684e126)}, + {TOBN(0x486ad8b6, 0xb40a2447), TOBN(0x60121fc1, 0x9fe3fb24), + TOBN(0x5626fccf, 0x1a8e3b3f), TOBN(0x4e568622, 0x6ad1f394)}}, + {{TOBN(0xda7aae0d, 0x196aa5a1), TOBN(0xe0df8c77, 0x1041b5fb), + TOBN(0x451465d9, 0x26b318b7), TOBN(0xc29b6e55, 0x7ab136e9)}, + {TOBN(0x2c2ab48b, 0x71148463), TOBN(0xb5738de3, 0x64454a76), + TOBN(0x54ccf9a0, 0x5a03abe4), TOBN(0x377c0296, 0x0427d58e)}}, + {{TOBN(0x73f5f0b9, 0x2bb39c1f), TOBN(0x14373f2c, 0xe608d8c5), + TOBN(0xdcbfd314, 0x00fbb805), TOBN(0xdf18fb20, 0x83afdcfb)}, + {TOBN(0x81a57f42, 0x42b3523f), TOBN(0xe958532d, 0x87f650fb), + TOBN(0xaa8dc8b6, 0x8b0a7d7c), TOBN(0x1b75dfb7, 0x150166be)}}, + {{TOBN(0x90e4f7c9, 0x2d7d1413), TOBN(0x67e2d6b5, 0x9834f597), + TOBN(0x4fd4f4f9, 0xa808c3e8), TOBN(0xaf8237e0, 0xd5281ec1)}, + {TOBN(0x25ab5fdc, 0x84687cee), TOBN(0xc5ded6b1, 0xa5b26c09), + TOBN(0x8e4a5aec, 0xc8ea7650), TOBN(0x23b73e5c, 0x14cc417f)}}, + {{TOBN(0x2bfb4318, 0x3037bf52), TOBN(0xb61e6db5, 0x78c725d7), + TOBN(0x8efd4060, 0xbbb3e5d7), TOBN(0x2e014701, 0xdbac488e)}, + {TOBN(0xac75cf9a, 0x360aa449), TOBN(0xb70cfd05, 0x79634d08), + TOBN(0xa591536d, 0xfffb15ef), TOBN(0xb2c37582, 0xd07c106c)}}, + {{TOBN(0xb4293fdc, 0xf50225f9), TOBN(0xc52e175c, 0xb0e12b03), + TOBN(0xf649c3ba, 0xd0a8bf64), TOBN(0x745a8fef, 0xeb8ae3c6)}, + {TOBN(0x30d7e5a3, 0x58321bc3), TOBN(0xb1732be7, 0x0bc4df48), + TOBN(0x1f217993, 0xe9ea5058), TOBN(0xf7a71cde, 0x3e4fd745)}}, + {{TOBN(0x86cc533e, 0x894c5bbb), TOBN(0x6915c7d9, 0x69d83082), + TOBN(0xa6aa2d05, 0x5815c244), TOBN(0xaeeee592, 0x49b22ce5)}, + {TOBN(0x89e39d13, 0x78135486), 
TOBN(0x3a275c1f, 0x16b76f2f), + TOBN(0xdb6bcc1b, 0xe036e8f5), TOBN(0x4df69b21, 0x5e4709f5)}}, + {{TOBN(0xa188b250, 0x2d0f39aa), TOBN(0x622118bb, 0x15a85947), + TOBN(0x2ebf520f, 0xfde0f4fa), TOBN(0xa40e9f29, 0x4860e539)}, + {TOBN(0x7b6a51eb, 0x22b57f0f), TOBN(0x849a33b9, 0x7e80644a), + TOBN(0x50e5d16f, 0x1cf095fe), TOBN(0xd754b54e, 0xec55f002)}}, + {{TOBN(0x5cfbbb22, 0x236f4a98), TOBN(0x0b0c59e9, 0x066800bb), + TOBN(0x4ac69a8f, 0x5a9a7774), TOBN(0x2b33f804, 0xd6bec948)}, + {TOBN(0xb3729295, 0x32e6c466), TOBN(0x68956d0f, 0x4e599c73), + TOBN(0xa47a249f, 0x155c31cc), TOBN(0x24d80f0d, 0xe1ce284e)}}, + {{TOBN(0xcd821dfb, 0x988baf01), TOBN(0xe6331a7d, 0xdbb16647), + TOBN(0x1eb8ad33, 0x094cb960), TOBN(0x593cca38, 0xc91bbca5)}, + {TOBN(0x384aac8d, 0x26567456), TOBN(0x40fa0309, 0xc04b6490), + TOBN(0x97834cd6, 0xdab6c8f6), TOBN(0x68a7318d, 0x3f91e55f)}}, + {{TOBN(0xa00fd04e, 0xfc4d3157), TOBN(0xb56f8ab2, 0x2bf3bdea), + TOBN(0x014f5648, 0x4fa57172), TOBN(0x948c5860, 0x450abdb3)}, + {TOBN(0x342b5df0, 0x0ebd4f08), TOBN(0x3e5168cd, 0x0e82938e), + TOBN(0x7aedc1ce, 0xb0df5dd0), TOBN(0x6bbbc6d9, 0xe5732516)}}, + {{TOBN(0xc7bfd486, 0x605daaa6), TOBN(0x46fd72b7, 0xbb9a6c9e), + TOBN(0xe4847fb1, 0xa124fb89), TOBN(0x75959cbd, 0xa2d8ffbc)}, + {TOBN(0x42579f65, 0xc8a588ee), TOBN(0x368c92e6, 0xb80b499d), + TOBN(0xea4ef6cd, 0x999a5df1), TOBN(0xaa73bb7f, 0x936fe604)}}, + {{TOBN(0xf347a70d, 0x6457d188), TOBN(0x86eda86b, 0x8b7a388b), + TOBN(0xb7cdff06, 0x0ccd6013), TOBN(0xbeb1b6c7, 0xd0053fb2)}, + {TOBN(0x0b022387, 0x99240a9f), TOBN(0x1bbb384f, 0x776189b2), + TOBN(0x8695e71e, 0x9066193a), TOBN(0x2eb50097, 0x06ffac7e)}}, + {{TOBN(0x0654a9c0, 0x4a7d2caa), TOBN(0x6f3fb3d1, 0xa5aaa290), + TOBN(0x835db041, 0xff476e8f), TOBN(0x540b8b0b, 0xc42295e4)}, + {TOBN(0xa5c73ac9, 0x05e214f5), TOBN(0x9a74075a, 0x56a0b638), + TOBN(0x2e4b1090, 0xce9e680b), TOBN(0x57a5b479, 0x6b8d9afa)}}, + {{TOBN(0x0dca48e7, 0x26bfe65c), TOBN(0x097e391c, 0x7290c307), + TOBN(0x683c462e, 0x6669e72e), TOBN(0xf505be1e, 0x062559ac)}, + {TOBN(0x5fbe3ea1, 0xe3a3035a), TOBN(0x6431ebf6, 0x9cd50da8), + TOBN(0xfd169d5c, 0x1f6407f2), TOBN(0x8d838a95, 0x60fce6b8)}}, + {{TOBN(0x2a2bfa7f, 0x650006f0), TOBN(0xdfd7dad3, 0x50c0fbb2), + TOBN(0x92452495, 0xccf9ad96), TOBN(0x183bf494, 0xd95635f9)}, + {TOBN(0x02d5df43, 0x4a7bd989), TOBN(0x505385cc, 0xa5431095), + TOBN(0xdd98e67d, 0xfd43f53e), TOBN(0xd61e1a6c, 0x500c34a9)}}, + {{TOBN(0x5a4b46c6, 0x4a8a3d62), TOBN(0x8469c4d0, 0x247743d2), + TOBN(0x2bb3a13d, 0x88f7e433), TOBN(0x62b23a10, 0x01be5849)}, + {TOBN(0xe83596b4, 0xa63d1a4c), TOBN(0x454e7fea, 0x7d183f3e), + TOBN(0x643fce61, 0x17afb01c), TOBN(0x4e65e5e6, 0x1c4c3638)}}, + {{TOBN(0x41d85ea1, 0xef74c45b), TOBN(0x2cfbfa66, 0xae328506), + TOBN(0x98b078f5, 0x3ada7da9), TOBN(0xd985fe37, 0xec752fbb)}, + {TOBN(0xeece68fe, 0x5a0148b4), TOBN(0x6f9a55c7, 0x2d78136d), + TOBN(0x232dccc4, 0xd2b729ce), TOBN(0xa27e0dfd, 0x90aafbc4)}}, + {{TOBN(0x96474452, 0x12b4603e), TOBN(0xa876c551, 0x6b706d14), + TOBN(0xdf145fcf, 0x69a9d412), TOBN(0xe2ab75b7, 0x2d479c34)}, + {TOBN(0x12df9a76, 0x1a23ff97), TOBN(0xc6138992, 0x5d359d10), + TOBN(0x6e51c7ae, 0xfa835f22), TOBN(0x69a79cb1, 0xc0fcc4d9)}}, + {{TOBN(0xf57f350d, 0x594cc7e1), TOBN(0x3079ca63, 0x3350ab79), + TOBN(0x226fb614, 0x9aff594a), TOBN(0x35afec02, 0x6d59a62b)}, + {TOBN(0x9bee46f4, 0x06ed2c6e), TOBN(0x58da1735, 0x7d939a57), + TOBN(0x44c50402, 0x8fd1797e), TOBN(0xd8853e7c, 0x5ccea6ca)}}, + {{TOBN(0x4065508d, 0xa35fcd5f), TOBN(0x8965df8c, 0x495ccaeb), + TOBN(0x0f2da850, 0x12e1a962), TOBN(0xee471b94, 0xc1cf1cc4)}, + {TOBN(0xcef19bc8, 
0x0a08fb75), TOBN(0x704958f5, 0x81de3591), + TOBN(0x2867f8b2, 0x3aef4f88), TOBN(0x8d749384, 0xea9f9a5f)}}, + {{TOBN(0x1b385537, 0x8c9049f4), TOBN(0x5be948f3, 0x7b92d8b6), + TOBN(0xd96f725d, 0xb6e2bd6b), TOBN(0x37a222bc, 0x958c454d)}, + {TOBN(0xe7c61abb, 0x8809bf61), TOBN(0x46f07fbc, 0x1346f18d), + TOBN(0xfb567a7a, 0xe87c0d1c), TOBN(0x84a461c8, 0x7ef3d07a)}}, + {{TOBN(0x0a5adce6, 0xd9278d98), TOBN(0x24d94813, 0x9dfc73e1), + TOBN(0x4f3528b6, 0x054321c3), TOBN(0x2e03fdde, 0x692ea706)}, + {TOBN(0x10e60619, 0x47b533c0), TOBN(0x1a8bc73f, 0x2ca3c055), + TOBN(0xae58d4b2, 0x1bb62b8f), TOBN(0xb2045a73, 0x584a24e3)}}, + {{TOBN(0x3ab3d5af, 0xbd76e195), TOBN(0x478dd1ad, 0x6938a810), + TOBN(0x6ffab393, 0x6ee3d5cb), TOBN(0xdfb693db, 0x22b361e4)}, + {TOBN(0xf9694496, 0x51dbf1a7), TOBN(0xcab4b4ef, 0x08a2e762), + TOBN(0xe8c92f25, 0xd39bba9a), TOBN(0x850e61bc, 0xf1464d96)}}, + {{TOBN(0xb7e830e3, 0xdc09508b), TOBN(0xfaf6d2cf, 0x74317655), + TOBN(0x72606ceb, 0xdf690355), TOBN(0x48bb92b3, 0xd0c3ded6)}, + {TOBN(0x65b75484, 0x5c7cf892), TOBN(0xf6cd7ac9, 0xd5d5f01f), + TOBN(0xc2c30a59, 0x96401d69), TOBN(0x91268650, 0xed921878)}}, + {{TOBN(0x380bf913, 0xb78c558f), TOBN(0x43c0baeb, 0xc8afdaa9), + TOBN(0x377f61d5, 0x54f169d3), TOBN(0xf8da07e3, 0xae5ff20b)}, + {TOBN(0xb676c49d, 0xa8a90ea8), TOBN(0x81c1ff2b, 0x83a29b21), + TOBN(0x383297ac, 0x2ad8d276), TOBN(0x3001122f, 0xba89f982)}}, + {{TOBN(0xe1d794be, 0x6718e448), TOBN(0x246c1482, 0x7c3e6e13), + TOBN(0x56646ef8, 0x5d26b5ef), TOBN(0x80f5091e, 0x88069cdd)}, + {TOBN(0xc5992e2f, 0x724bdd38), TOBN(0x02e915b4, 0x8471e8c7), + TOBN(0x96ff320a, 0x0d0ff2a9), TOBN(0xbf886487, 0x4384d1a0)}}, + {{TOBN(0xbbe1e6a6, 0xc93f72d6), TOBN(0xd5f75d12, 0xcad800ea), + TOBN(0xfa40a09f, 0xe7acf117), TOBN(0x32c8cdd5, 0x7581a355)}, + {TOBN(0x74221992, 0x7023c499), TOBN(0xa8afe5d7, 0x38ec3901), + TOBN(0x5691afcb, 0xa90e83f0), TOBN(0x41bcaa03, 0x0b8f8eac)}}, + {{TOBN(0xe38b5ff9, 0x8d2668d5), TOBN(0x0715281a, 0x7ad81965), + TOBN(0x1bc8fc7c, 0x03c6ce11), TOBN(0xcbbee6e2, 0x8b650436)}, + {TOBN(0x06b00fe8, 0x0cdb9808), TOBN(0x17d6e066, 0xfe3ed315), + TOBN(0x2e9d38c6, 0x4d0b5018), TOBN(0xab8bfd56, 0x844dcaef)}}, + {{TOBN(0x42894a59, 0x513aed8b), TOBN(0xf77f3b6d, 0x314bd07a), + TOBN(0xbbdecb8f, 0x8e42b582), TOBN(0xf10e2fa8, 0xd2390fe6)}, + {TOBN(0xefb95022, 0x62a2f201), TOBN(0x4d59ea50, 0x50ee32b0), + TOBN(0xd87f7728, 0x6da789a8), TOBN(0xcf98a2cf, 0xf79492c4)}}, + {{TOBN(0xf9577239, 0x720943c2), TOBN(0xba044cf5, 0x3990b9d0), + TOBN(0x5aa8e823, 0x95f2884a), TOBN(0x834de6ed, 0x0278a0af)}, + {TOBN(0xc8e1ee9a, 0x5f25bd12), TOBN(0x9259ceaa, 0x6f7ab271), + TOBN(0x7e6d97a2, 0x77d00b76), TOBN(0x5c0c6eea, 0xa437832a)}}, + {{TOBN(0x5232c20f, 0x5606b81d), TOBN(0xabd7b375, 0x0d991ee5), + TOBN(0x4d2bfe35, 0x8632d951), TOBN(0x78f85146, 0x98ed9364)}, + {TOBN(0x951873f0, 0xf30c3282), TOBN(0x0da8ac80, 0xa789230b), + TOBN(0x3ac7789c, 0x5398967f), TOBN(0xa69b8f7f, 0xbdda0fb5)}}, + {{TOBN(0xe5db7717, 0x6add8545), TOBN(0x1b71cb66, 0x72c49b66), + TOBN(0xd8560739, 0x68421d77), TOBN(0x03840fe8, 0x83e3afea)}, + {TOBN(0xb391dad5, 0x1ec69977), TOBN(0xae243fb9, 0x307f6726), + TOBN(0xc88ac87b, 0xe8ca160c), TOBN(0x5174cced, 0x4ce355f4)}}, + {{TOBN(0x98a35966, 0xe58ba37d), TOBN(0xfdcc8da2, 0x7817335d), + TOBN(0x5b752830, 0x83fbc7bf), TOBN(0x68e419d4, 0xd9c96984)}, + {TOBN(0x409a39f4, 0x02a40380), TOBN(0x88940faf, 0x1fe977bc), + TOBN(0xc640a94b, 0x8f8edea6), TOBN(0x1e22cd17, 0xed11547d)}}, + {{TOBN(0xe28568ce, 0x59ffc3e2), TOBN(0x60aa1b55, 0xc1dee4e7), + TOBN(0xc67497c8, 0x837cb363), TOBN(0x06fb438a, 0x105a2bf2)}, + 
{TOBN(0x30357ec4, 0x500d8e20), TOBN(0x1ad9095d, 0x0670db10), + TOBN(0x7f589a05, 0xc73b7cfd), TOBN(0xf544607d, 0x880d6d28)}}, + {{TOBN(0x17ba93b1, 0xa20ef103), TOBN(0xad859130, 0x6ba6577b), + TOBN(0x65c91cf6, 0x6fa214a0), TOBN(0xd7d49c6c, 0x27990da5)}, + {TOBN(0xecd9ec8d, 0x20bb569d), TOBN(0xbd4b2502, 0xeeffbc33), + TOBN(0x2056ca5a, 0x6bed0467), TOBN(0x7916a1f7, 0x5b63728c)}}, + {{TOBN(0xd4f9497d, 0x53a4f566), TOBN(0x89734664, 0x97b56810), + TOBN(0xf8e1da74, 0x0494a621), TOBN(0x82546a93, 0x8d011c68)}, + {TOBN(0x1f3acb19, 0xc61ac162), TOBN(0x52f8fa9c, 0xabad0d3e), + TOBN(0x15356523, 0xb4b7ea43), TOBN(0x5a16ad61, 0xae608125)}}, + {{TOBN(0xb0bcb87f, 0x4faed184), TOBN(0x5f236b1d, 0x5029f45f), + TOBN(0xd42c7607, 0x0bc6b1fc), TOBN(0xc644324e, 0x68aefce3)}, + {TOBN(0x8e191d59, 0x5c5d8446), TOBN(0xc0208077, 0x13ae1979), + TOBN(0xadcaee55, 0x3ba59cc7), TOBN(0x20ed6d6b, 0xa2cb81ba)}}, + {{TOBN(0x0952ba19, 0xb6efcffc), TOBN(0x60f12d68, 0x97c0b87c), + TOBN(0x4ee2c7c4, 0x9caa30bc), TOBN(0x767238b7, 0x97fbff4e)}, + {TOBN(0xebc73921, 0x501b5d92), TOBN(0x3279e3df, 0xc2a37737), + TOBN(0x9fc12bc8, 0x6d197543), TOBN(0xfa94dc6f, 0x0a40db4e)}}, + {{TOBN(0x7392b41a, 0x530ccbbd), TOBN(0x87c82146, 0xea823525), + TOBN(0xa52f984c, 0x05d98d0c), TOBN(0x2ae57d73, 0x5ef6974c)}, + {TOBN(0x9377f7bf, 0x3042a6dd), TOBN(0xb1a007c0, 0x19647a64), + TOBN(0xfaa9079a, 0x0cca9767), TOBN(0x3d81a25b, 0xf68f72d5)}}, + {{TOBN(0x752067f8, 0xff81578e), TOBN(0x78622150, 0x9045447d), + TOBN(0xc0c22fcf, 0x0505aa6f), TOBN(0x1030f0a6, 0x6bed1c77)}, + {TOBN(0x31f29f15, 0x1f0bd739), TOBN(0x2d7989c7, 0xe6debe85), + TOBN(0x5c070e72, 0x8e677e98), TOBN(0x0a817bd3, 0x06e81fd5)}}, + {{TOBN(0xc110d830, 0xb0f2ac95), TOBN(0x48d0995a, 0xab20e64e), + TOBN(0x0f3e00e1, 0x7729cd9a), TOBN(0x2a570c20, 0xdd556946)}, + {TOBN(0x912dbcfd, 0x4e86214d), TOBN(0x2d014ee2, 0xcf615498), + TOBN(0x55e2b1e6, 0x3530d76e), TOBN(0xc5135ae4, 0xfd0fd6d1)}}, + {{TOBN(0x0066273a, 0xd4f3049f), TOBN(0xbb8e9893, 0xe7087477), + TOBN(0x2dba1ddb, 0x14c6e5fd), TOBN(0xdba37886, 0x51f57e6c)}, + {TOBN(0x5aaee0a6, 0x5a72f2cf), TOBN(0x1208bfbf, 0x7bea5642), + TOBN(0xf5c6aa3b, 0x67872c37), TOBN(0xd726e083, 0x43f93224)}}, + {{TOBN(0x1854daa5, 0x061f1658), TOBN(0xc0016df1, 0xdf0cd2b3), + TOBN(0xc2a3f23e, 0x833d50de), TOBN(0x73b681d2, 0xbbbd3017)}, + {TOBN(0x2f046dc4, 0x3ac343c0), TOBN(0x9c847e7d, 0x85716421), + TOBN(0xe1e13c91, 0x0917eed4), TOBN(0x3fc9eebd, 0x63a1b9c6)}}, + {{TOBN(0x0f816a72, 0x7fe02299), TOBN(0x6335ccc2, 0x294f3319), + TOBN(0x3820179f, 0x4745c5be), TOBN(0xe647b782, 0x922f066e)}, + {TOBN(0xc22e49de, 0x02cafb8a), TOBN(0x299bc2ff, 0xfcc2eccc), + TOBN(0x9a8feea2, 0x6e0e8282), TOBN(0xa627278b, 0xfe893205)}}, + {{TOBN(0xa7e19733, 0x7933e47b), TOBN(0xf4ff6b13, 0x2e766402), + TOBN(0xa4d8be0a, 0x98440d9f), TOBN(0x658f5c2f, 0x38938808)}, + {TOBN(0x90b75677, 0xc95b3b3e), TOBN(0xfa044269, 0x3137b6ff), + TOBN(0x077b039b, 0x43c47c29), TOBN(0xcca95dd3, 0x8a6445b2)}}, + {{TOBN(0x0b498ba4, 0x2333fc4c), TOBN(0x274f8e68, 0xf736a1b1), + TOBN(0x6ca348fd, 0x5f1d4b2e), TOBN(0x24d3be78, 0xa8f10199)}, + {TOBN(0x8535f858, 0xca14f530), TOBN(0xa6e7f163, 0x5b982e51), + TOBN(0x847c8512, 0x36e1bf62), TOBN(0xf6a7c58e, 0x03448418)}}, + {{TOBN(0x583f3703, 0xf9374ab6), TOBN(0x864f9195, 0x6e564145), + TOBN(0x33bc3f48, 0x22526d50), TOBN(0x9f323c80, 0x1262a496)}, + {TOBN(0xaa97a7ae, 0x3f046a9a), TOBN(0x70da183e, 0xdf8a039a), + TOBN(0x5b68f71c, 0x52aa0ba6), TOBN(0x9be0fe51, 0x21459c2d)}}, + {{TOBN(0xc1e17eb6, 0xcbc613e5), TOBN(0x33131d55, 0x497ea61c), + TOBN(0x2f69d39e, 0xaf7eded5), TOBN(0x73c2f434, 0xde6af11b)}, 
+ {TOBN(0x4ca52493, 0xa4a375fa), TOBN(0x5f06787c, 0xb833c5c2), + TOBN(0x814e091f, 0x3e6e71cf), TOBN(0x76451f57, 0x8b746666)}}}, + {{{TOBN(0x80f9bdef, 0x694db7e0), TOBN(0xedca8787, 0xb9fcddc6), + TOBN(0x51981c34, 0x03b8dce1), TOBN(0x4274dcf1, 0x70e10ba1)}, + {TOBN(0xf72743b8, 0x6def6d1a), TOBN(0xd25b1670, 0xebdb1866), + TOBN(0xc4491e8c, 0x050c6f58), TOBN(0x2be2b2ab, 0x87fbd7f5)}}, + {{TOBN(0x3e0e5c9d, 0xd111f8ec), TOBN(0xbcc33f8d, 0xb7c4e760), + TOBN(0x702f9a91, 0xbd392a51), TOBN(0x7da4a795, 0xc132e92d)}, + {TOBN(0x1a0b0ae3, 0x0bb1151b), TOBN(0x54febac8, 0x02e32251), + TOBN(0xea3a5082, 0x694e9e78), TOBN(0xe58ffec1, 0xe4fe40b8)}}, + {{TOBN(0xf85592fc, 0xd1e0cf9e), TOBN(0xdea75f0d, 0xc0e7b2e8), + TOBN(0xc04215cf, 0xc135584e), TOBN(0x174fc727, 0x2f57092a)}, + {TOBN(0xe7277877, 0xeb930bea), TOBN(0x504caccb, 0x5eb02a5a), + TOBN(0xf9fe08f7, 0xf5241b9b), TOBN(0xe7fb62f4, 0x8d5ca954)}}, + {{TOBN(0xfbb8349d, 0x29c4120b), TOBN(0x9f94391f, 0xc0d0d915), + TOBN(0xc4074fa7, 0x5410ba51), TOBN(0xa66adbf6, 0x150a5911)}, + {TOBN(0xc164543c, 0x34bfca38), TOBN(0xe0f27560, 0xb9e1ccfc), + TOBN(0x99da0f53, 0xe820219c), TOBN(0xe8234498, 0xc6b4997a)}}, + {{TOBN(0xcfb88b76, 0x9d4c5423), TOBN(0x9e56eb10, 0xb0521c49), + TOBN(0x418e0b5e, 0xbe8700a1), TOBN(0x00cbaad6, 0xf93cb58a)}, + {TOBN(0xe923fbde, 0xd92a5e67), TOBN(0xca4979ac, 0x1f347f11), + TOBN(0x89162d85, 0x6bc0585b), TOBN(0xdd6254af, 0xac3c70e3)}}, + {{TOBN(0x7b23c513, 0x516e19e4), TOBN(0x56e2e847, 0xc5c4d593), + TOBN(0x9f727d73, 0x5ce71ef6), TOBN(0x5b6304a6, 0xf79a44c5)}, + {TOBN(0x6638a736, 0x3ab7e433), TOBN(0x1adea470, 0xfe742f83), + TOBN(0xe054b854, 0x5b7fc19f), TOBN(0xf935381a, 0xba1d0698)}}, + {{TOBN(0x546eab2d, 0x799e9a74), TOBN(0x96239e0e, 0xa949f729), + TOBN(0xca274c6b, 0x7090055a), TOBN(0x835142c3, 0x9020c9b0)}, + {TOBN(0xa405667a, 0xa2e8807f), TOBN(0x29f2c085, 0x1aa3d39e), + TOBN(0xcc555d64, 0x42fc72f5), TOBN(0xe856e0e7, 0xfbeacb3c)}}, + {{TOBN(0xb5504f9d, 0x918e4936), TOBN(0x65035ef6, 0xb2513982), + TOBN(0x0553a0c2, 0x6f4d9cb9), TOBN(0x6cb10d56, 0xbea85509)}, + {TOBN(0x48d957b7, 0xa242da11), TOBN(0x16a4d3dd, 0x672b7268), + TOBN(0x3d7e637c, 0x8502a96b), TOBN(0x27c7032b, 0x730d463b)}}, + {{TOBN(0xbdc02b18, 0xe4136a14), TOBN(0xbacf969d, 0x678e32bf), + TOBN(0xc98d89a3, 0xdd9c3c03), TOBN(0x7b92420a, 0x23becc4f)}, + {TOBN(0xd4b41f78, 0xc64d565c), TOBN(0x9f969d00, 0x10f28295), + TOBN(0xec7f7f76, 0xb13d051a), TOBN(0x08945e1e, 0xa92da585)}}, + {{TOBN(0x55366b7d, 0x5846426f), TOBN(0xe7d09e89, 0x247d441d), + TOBN(0x510b404d, 0x736fbf48), TOBN(0x7fa003d0, 0xe784bd7d)}, + {TOBN(0x25f7614f, 0x17fd9596), TOBN(0x49e0e0a1, 0x35cb98db), + TOBN(0x2c65957b, 0x2e83a76a), TOBN(0x5d40da8d, 0xcddbe0f8)}}, + {{TOBN(0xf2b8c405, 0x050bad24), TOBN(0x8918426d, 0xc2aa4823), + TOBN(0x2aeab3dd, 0xa38365a7), TOBN(0x72031717, 0x7c91b690)}, + {TOBN(0x8b00d699, 0x60a94120), TOBN(0x478a255d, 0xe99eaeec), + TOBN(0xbf656a5f, 0x6f60aafd), TOBN(0xdfd7cb75, 0x5dee77b3)}}, + {{TOBN(0x37f68bb4, 0xa595939d), TOBN(0x03556479, 0x28740217), + TOBN(0x8e740e7c, 0x84ad7612), TOBN(0xd89bc843, 0x9044695f)}, + {TOBN(0xf7f3da5d, 0x85a9184d), TOBN(0x562563bb, 0x9fc0b074), + TOBN(0x06d2e6aa, 0xf88a888e), TOBN(0x612d8643, 0x161fbe7c)}}, + {{TOBN(0x465edba7, 0xf64085e7), TOBN(0xb230f304, 0x29aa8511), + TOBN(0x53388426, 0xcda2d188), TOBN(0x90885735, 0x4b666649)}, + {TOBN(0x6f02ff9a, 0x652f54f6), TOBN(0x65c82294, 0x5fae2bf0), + TOBN(0x7816ade0, 0x62f5eee3), TOBN(0xdcdbdf43, 0xfcc56d70)}}, + {{TOBN(0x9fb3bba3, 0x54530bb2), TOBN(0xbde3ef77, 0xcb0869ea), + TOBN(0x89bc9046, 0x0b431163), TOBN(0x4d03d7d2, 
0xe4819a35)}, + {TOBN(0x33ae4f9e, 0x43b6a782), TOBN(0x216db307, 0x9c88a686), + TOBN(0x91dd88e0, 0x00ffedd9), TOBN(0xb280da9f, 0x12bd4840)}}, + {{TOBN(0x32a7cb8a, 0x1635e741), TOBN(0xfe14008a, 0x78be02a7), + TOBN(0x3fafb334, 0x1b7ae030), TOBN(0x7fd508e7, 0x5add0ce9)}, + {TOBN(0x72c83219, 0xd607ad51), TOBN(0x0f229c0a, 0x8d40964a), + TOBN(0x1be2c336, 0x1c878da2), TOBN(0xe0c96742, 0xeab2ab86)}}, + {{TOBN(0x458f8691, 0x3e538cd7), TOBN(0xa7001f6c, 0x8e08ad53), + TOBN(0x52b8c6e6, 0xbf5d15ff), TOBN(0x548234a4, 0x011215dd)}, + {TOBN(0xff5a9d2d, 0x3d5b4045), TOBN(0xb0ffeeb6, 0x4a904190), + TOBN(0x55a3aca4, 0x48607f8b), TOBN(0x8cbd665c, 0x30a0672a)}}, + {{TOBN(0x87f834e0, 0x42583068), TOBN(0x02da2aeb, 0xf3f6e683), + TOBN(0x6b763e5d, 0x05c12248), TOBN(0x7230378f, 0x65a8aefc)}, + {TOBN(0x93bd80b5, 0x71e8e5ca), TOBN(0x53ab041c, 0xb3b62524), + TOBN(0x1b860513, 0x6c9c552e), TOBN(0xe84d402c, 0xd5524e66)}}, + {{TOBN(0xa37f3573, 0xf37f5937), TOBN(0xeb0f6c7d, 0xd1e4fca5), + TOBN(0x2965a554, 0xac8ab0fc), TOBN(0x17fbf56c, 0x274676ac)}, + {TOBN(0x2e2f6bd9, 0xacf7d720), TOBN(0x41fc8f88, 0x10224766), + TOBN(0x517a14b3, 0x85d53bef), TOBN(0xdae327a5, 0x7d76a7d1)}}, + {{TOBN(0x6ad0a065, 0xc4818267), TOBN(0x33aa189b, 0x37c1bbc1), + TOBN(0x64970b52, 0x27392a92), TOBN(0x21699a1c, 0x2d1535ea)}, + {TOBN(0xcd20779c, 0xc2d7a7fd), TOBN(0xe3186059, 0x99c83cf2), + TOBN(0x9b69440b, 0x72c0b8c7), TOBN(0xa81497d7, 0x7b9e0e4d)}}, + {{TOBN(0x515d5c89, 0x1f5f82dc), TOBN(0x9a7f67d7, 0x6361079e), + TOBN(0xa8da81e3, 0x11a35330), TOBN(0xe44990c4, 0x4b18be1b)}, + {TOBN(0xc7d5ed95, 0xaf103e59), TOBN(0xece8aba7, 0x8dac9261), + TOBN(0xbe82b099, 0x9394b8d3), TOBN(0x6830f09a, 0x16adfe83)}}, + {{TOBN(0x250a29b4, 0x88172d01), TOBN(0x8b20bd65, 0xcaff9e02), + TOBN(0xb8a7661e, 0xe8a6329a), TOBN(0x4520304d, 0xd3fce920)}, + {TOBN(0xae45da1f, 0x2b47f7ef), TOBN(0xe07f5288, 0x5bffc540), + TOBN(0xf7997009, 0x3464f874), TOBN(0x2244c2cd, 0xa6fa1f38)}}, + {{TOBN(0x43c41ac1, 0x94d7d9b1), TOBN(0x5bafdd82, 0xc82e7f17), + TOBN(0xdf0614c1, 0x5fda0fca), TOBN(0x74b043a7, 0xa8ae37ad)}, + {TOBN(0x3ba6afa1, 0x9e71734c), TOBN(0x15d5437e, 0x9c450f2e), + TOBN(0x4a5883fe, 0x67e242b1), TOBN(0x5143bdc2, 0x2c1953c2)}}, + {{TOBN(0x542b8b53, 0xfc5e8920), TOBN(0x363bf9a8, 0x9a9cee08), + TOBN(0x02375f10, 0xc3486e08), TOBN(0x2037543b, 0x8c5e70d2)}, + {TOBN(0x7109bccc, 0x625640b4), TOBN(0xcbc1051e, 0x8bc62c3b), + TOBN(0xf8455fed, 0x803f26ea), TOBN(0x6badceab, 0xeb372424)}}, + {{TOBN(0xa2a9ce7c, 0x6b53f5f9), TOBN(0x64246595, 0x1b176d99), + TOBN(0xb1298d36, 0xb95c081b), TOBN(0x53505bb8, 0x1d9a9ee6)}, + {TOBN(0x3f6f9e61, 0xf2ba70b0), TOBN(0xd07e16c9, 0x8afad453), + TOBN(0x9f1694bb, 0xe7eb4a6a), TOBN(0xdfebced9, 0x3cb0bc8e)}}, + {{TOBN(0x92d3dcdc, 0x53868c8b), TOBN(0x174311a2, 0x386107a6), + TOBN(0x4109e07c, 0x689b4e64), TOBN(0x30e4587f, 0x2df3dcb6)}, + {TOBN(0x841aea31, 0x0811b3b2), TOBN(0x6144d41d, 0x0cce43ea), + TOBN(0x464c4581, 0x2a9a7803), TOBN(0xd03d371f, 0x3e158930)}}, + {{TOBN(0xc676d7f2, 0xb1f3390b), TOBN(0x9f7a1b8c, 0xa5b61272), + TOBN(0x4ebebfc9, 0xc2e127a9), TOBN(0x4602500c, 0x5dd997bf)}, + {TOBN(0x7f09771c, 0x4711230f), TOBN(0x058eb37c, 0x020f09c1), + TOBN(0xab693d4b, 0xfee5e38b), TOBN(0x9289eb1f, 0x4653cbc0)}}, + {{TOBN(0xbecf46ab, 0xd51b9cf5), TOBN(0xd2aa9c02, 0x9f0121af), + TOBN(0x36aaf7d2, 0xe90dc274), TOBN(0x909e4ea0, 0x48b95a3c)}, + {TOBN(0xe6b70496, 0x6f32dbdb), TOBN(0x672188a0, 0x8b030b3e), + TOBN(0xeeffe5b3, 0xcfb617e2), TOBN(0x87e947de, 0x7c82709e)}}, + {{TOBN(0xa44d2b39, 0x1770f5a7), TOBN(0xe4d4d791, 0x0e44eb82), + TOBN(0x42e69d1e, 0x3f69712a), 
TOBN(0xbf11c4d6, 0xac6a820e)}, + {TOBN(0xb5e7f3e5, 0x42c4224c), TOBN(0xd6b4e81c, 0x449d941c), + TOBN(0x5d72bd16, 0x5450e878), TOBN(0x6a61e28a, 0xee25ac54)}}, + {{TOBN(0x33272094, 0xe6f1cd95), TOBN(0x7512f30d, 0x0d18673f), + TOBN(0x32f7a4ca, 0x5afc1464), TOBN(0x2f095656, 0x6bbb977b)}, + {TOBN(0x586f47ca, 0xa8226200), TOBN(0x02c868ad, 0x1ac07369), + TOBN(0x4ef2b845, 0xc613acbe), TOBN(0x43d7563e, 0x0386054c)}}, + {{TOBN(0x54da9dc7, 0xab952578), TOBN(0xb5423df2, 0x26e84d0b), + TOBN(0xa8b64eeb, 0x9b872042), TOBN(0xac205782, 0x5990f6df)}, + {TOBN(0x4ff696eb, 0x21f4c77a), TOBN(0x1a79c3e4, 0xaab273af), + TOBN(0x29bc922e, 0x9436b3f1), TOBN(0xff807ef8, 0xd6d9a27a)}}, + {{TOBN(0x82acea3d, 0x778f22a0), TOBN(0xfb10b2e8, 0x5b5e7469), + TOBN(0xc0b16980, 0x2818ee7d), TOBN(0x011afff4, 0xc91c1a2f)}, + {TOBN(0x95a6d126, 0xad124418), TOBN(0x31c081a5, 0xe72e295f), + TOBN(0x36bb283a, 0xf2f4db75), TOBN(0xd115540f, 0x7acef462)}}, + {{TOBN(0xc7f3a8f8, 0x33f6746c), TOBN(0x21e46f65, 0xfea990ca), + TOBN(0x915fd5c5, 0xcaddb0a9), TOBN(0xbd41f016, 0x78614555)}, + {TOBN(0x346f4434, 0x426ffb58), TOBN(0x80559436, 0x14dbc204), + TOBN(0xf3dd20fe, 0x5a969b7f), TOBN(0x9d59e956, 0xe899a39a)}}, + {{TOBN(0xf1b0971c, 0x8ad4cf4b), TOBN(0x03448860, 0x2ffb8fb8), + TOBN(0xf071ac3c, 0x65340ba4), TOBN(0x408d0596, 0xb27fd758)}, + {TOBN(0xe7c78ea4, 0x98c364b0), TOBN(0xa4aac4a5, 0x051e8ab5), + TOBN(0xb9e1d560, 0x485d9002), TOBN(0x9acd518a, 0x88844455)}}, + {{TOBN(0xe4ca688f, 0xd06f56c0), TOBN(0xa48af70d, 0xdf027972), + TOBN(0x691f0f04, 0x5e9a609d), TOBN(0xa9dd82cd, 0xee61270e)}, + {TOBN(0x8903ca63, 0xa0ef18d3), TOBN(0x9fb7ee35, 0x3d6ca3bd), + TOBN(0xa7b4a09c, 0xabf47d03), TOBN(0x4cdada01, 0x1c67de8e)}}, + {{TOBN(0x52003749, 0x9355a244), TOBN(0xe77fd2b6, 0x4f2151a9), + TOBN(0x695d6cf6, 0x66b4efcb), TOBN(0xc5a0cacf, 0xda2cfe25)}, + {TOBN(0x104efe5c, 0xef811865), TOBN(0xf52813e8, 0x9ea5cc3d), + TOBN(0x855683dc, 0x40b58dbc), TOBN(0x0338ecde, 0x175fcb11)}}, + {{TOBN(0xf9a05637, 0x74921592), TOBN(0xb4f1261d, 0xb9bb9d31), + TOBN(0x551429b7, 0x4e9c5459), TOBN(0xbe182e6f, 0x6ea71f53)}, + {TOBN(0xd3a3b07c, 0xdfc50573), TOBN(0x9ba1afda, 0x62be8d44), + TOBN(0x9bcfd2cb, 0x52ab65d3), TOBN(0xdf11d547, 0xa9571802)}}, + {{TOBN(0x099403ee, 0x02a2404a), TOBN(0x497406f4, 0x21088a71), + TOBN(0x99479409, 0x5004ae71), TOBN(0xbdb42078, 0xa812c362)}, + {TOBN(0x2b72a30f, 0xd8828442), TOBN(0x283add27, 0xfcb5ed1c), + TOBN(0xf7c0e200, 0x66a40015), TOBN(0x3e3be641, 0x08b295ef)}}, + {{TOBN(0xac127dc1, 0xe038a675), TOBN(0x729deff3, 0x8c5c6320), + TOBN(0xb7df8fd4, 0xa90d2c53), TOBN(0x9b74b0ec, 0x681e7cd3)}, + {TOBN(0x5cb5a623, 0xdab407e5), TOBN(0xcdbd3615, 0x76b340c6), + TOBN(0xa184415a, 0x7d28392c), TOBN(0xc184c1d8, 0xe96f7830)}}, + {{TOBN(0xc3204f19, 0x81d3a80f), TOBN(0xfde0c841, 0xc8e02432), + TOBN(0x78203b3e, 0x8149e0c1), TOBN(0x5904bdbb, 0x08053a73)}, + {TOBN(0x30fc1dd1, 0x101b6805), TOBN(0x43c223bc, 0x49aa6d49), + TOBN(0x9ed67141, 0x7a174087), TOBN(0x311469a0, 0xd5997008)}}, + {{TOBN(0xb189b684, 0x5e43fc61), TOBN(0xf3282375, 0xe0d3ab57), + TOBN(0x4fa34b67, 0xb1181da8), TOBN(0x621ed0b2, 0x99ee52b8)}, + {TOBN(0x9b178de1, 0xad990676), TOBN(0xd51de67b, 0x56d54065), + TOBN(0x2a2c27c4, 0x7538c201), TOBN(0x33856ec8, 0x38a40f5c)}}, + {{TOBN(0x2522fc15, 0xbe6cdcde), TOBN(0x1e603f33, 0x9f0c6f89), + TOBN(0x7994edc3, 0x103e30a6), TOBN(0x033a00db, 0x220c853e)}, + {TOBN(0xd3cfa409, 0xf7bb7fd7), TOBN(0x70f8781e, 0x462d18f6), + TOBN(0xbbd82980, 0x687fe295), TOBN(0x6eef4c32, 0x595669f3)}}, + {{TOBN(0x86a9303b, 0x2f7e85c3), TOBN(0x5fce4621, 0x71988f9b), + TOBN(0x5b935bf6, 
0xc138acb5), TOBN(0x30ea7d67, 0x25661212)}, + {TOBN(0xef1eb5f4, 0xe51ab9a2), TOBN(0x0587c98a, 0xae067c78), + TOBN(0xb3ce1b3c, 0x77ca9ca6), TOBN(0x2a553d4d, 0x54b5f057)}}, + {{TOBN(0xc7898236, 0x4da29ec2), TOBN(0xdbdd5d13, 0xb9c57316), + TOBN(0xc57d6e6b, 0x2cd80d47), TOBN(0x80b460cf, 0xfe9e7391)}, + {TOBN(0x98648cab, 0xf963c31e), TOBN(0x67f9f633, 0xcc4d32fd), + TOBN(0x0af42a9d, 0xfdf7c687), TOBN(0x55f292a3, 0x0b015ea7)}}, + {{TOBN(0x89e468b2, 0xcd21ab3d), TOBN(0xe504f022, 0xc393d392), + TOBN(0xab21e1d4, 0xa5013af9), TOBN(0xe3283f78, 0xc2c28acb)}, + {TOBN(0xf38b35f6, 0x226bf99f), TOBN(0xe8354274, 0x0e291e69), + TOBN(0x61673a15, 0xb20c162d), TOBN(0xc101dc75, 0xb04fbdbe)}}, + {{TOBN(0x8323b4c2, 0x255bd617), TOBN(0x6c969693, 0x6c2a9154), + TOBN(0xc6e65860, 0x62679387), TOBN(0x8e01db0c, 0xb8c88e23)}, + {TOBN(0x33c42873, 0x893a5559), TOBN(0x7630f04b, 0x47a3e149), + TOBN(0xb5d80805, 0xddcf35f8), TOBN(0x582ca080, 0x77dfe732)}}, + {{TOBN(0x2c7156e1, 0x0b1894a0), TOBN(0x92034001, 0xd81c68c0), + TOBN(0xed225d00, 0xc8b115b5), TOBN(0x237f9c22, 0x83b907f2)}, + {TOBN(0x0ea2f32f, 0x4470e2c0), TOBN(0xb725f7c1, 0x58be4e95), + TOBN(0x0f1dcafa, 0xb1ae5463), TOBN(0x59ed5187, 0x1ba2fc04)}}, + {{TOBN(0xf6e0f316, 0xd0115d4d), TOBN(0x5180b12f, 0xd3691599), + TOBN(0x157e32c9, 0x527f0a41), TOBN(0x7b0b081d, 0xa8e0ecc0)}, + {TOBN(0x6dbaaa8a, 0xbf4f0dd0), TOBN(0x99b289c7, 0x4d252696), + TOBN(0x79b7755e, 0xdbf864fe), TOBN(0x6974e2b1, 0x76cad3ab)}}, + {{TOBN(0x35dbbee2, 0x06ddd657), TOBN(0xe7cbdd11, 0x2ff3a96d), + TOBN(0x88381968, 0x076be758), TOBN(0x2d737e72, 0x08c91f5d)}, + {TOBN(0x5f83ab62, 0x86ec3776), TOBN(0x98aa649d, 0x945fa7a1), + TOBN(0xf477ec37, 0x72ef0933), TOBN(0x66f52b1e, 0x098c17b1)}}, + {{TOBN(0x9eec58fb, 0xd803738b), TOBN(0x91aaade7, 0xe4e86aa4), + TOBN(0x6b1ae617, 0xa5b51492), TOBN(0x63272121, 0xbbc45974)}, + {TOBN(0x7e0e28f0, 0x862c5129), TOBN(0x0a8f79a9, 0x3321a4a0), + TOBN(0xe26d1664, 0x5041c88f), TOBN(0x0571b805, 0x53233e3a)}}, + {{TOBN(0xd1b0ccde, 0xc9520711), TOBN(0x55a9e4ed, 0x3c8b84bf), + TOBN(0x9426bd39, 0xa1fef314), TOBN(0x4f5f638e, 0x6eb93f2b)}, + {TOBN(0xba2a1ed3, 0x2bf9341b), TOBN(0xd63c1321, 0x4d42d5a9), + TOBN(0xd2964a89, 0x316dc7c5), TOBN(0xd1759606, 0xca511851)}}, + {{TOBN(0xd8a9201f, 0xf9e6ed35), TOBN(0xb7b5ee45, 0x6736925a), + TOBN(0x0a83fbbc, 0x99581af7), TOBN(0x3076bc40, 0x64eeb051)}, + {TOBN(0x5511c98c, 0x02dec312), TOBN(0x270de898, 0x238dcb78), + TOBN(0x2cf4cf9c, 0x539c08c9), TOBN(0xa70cb65e, 0x38d3b06e)}}, + {{TOBN(0xb12ec10e, 0xcfe57bbd), TOBN(0x82c7b656, 0x35a0c2b5), + TOBN(0xddc7d5cd, 0x161c67bd), TOBN(0xe32e8985, 0xae3a32cc)}, + {TOBN(0x7aba9444, 0xd11a5529), TOBN(0xe964ed02, 0x2427fa1a), + TOBN(0x1528392d, 0x24a1770a), TOBN(0xa152ce2c, 0x12c72fcd)}}, + {{TOBN(0x714553a4, 0x8ec07649), TOBN(0x18b4c290, 0x459dd453), + TOBN(0xea32b714, 0x7b64b110), TOBN(0xb871bfa5, 0x2e6f07a2)}, + {TOBN(0xb67112e5, 0x9e2e3c9b), TOBN(0xfbf250e5, 0x44aa90f6), + TOBN(0xf77aedb8, 0xbd539006), TOBN(0x3b0cdf9a, 0xd172a66f)}}, + {{TOBN(0xedf69fea, 0xf8c51187), TOBN(0x05bb67ec, 0x741e4da7), + TOBN(0x47df0f32, 0x08114345), TOBN(0x56facb07, 0xbb9792b1)}, + {TOBN(0xf3e007e9, 0x8f6229e4), TOBN(0x62d103f4, 0x526fba0f), + TOBN(0x4f33bef7, 0xb0339d79), TOBN(0x9841357b, 0xb59bfec1)}}, + {{TOBN(0xfa8dbb59, 0xc34e6705), TOBN(0xc3c7180b, 0x7fdaa84c), + TOBN(0xf95872fc, 0xa4108537), TOBN(0x8750cc3b, 0x932a3e5a)}, + {TOBN(0xb61cc69d, 0xb7275d7d), TOBN(0xffa0168b, 0x2e59b2e9), + TOBN(0xca032abc, 0x6ecbb493), TOBN(0x1d86dbd3, 0x2c9082d8)}}, + {{TOBN(0xae1e0b67, 0xe28ef5ba), TOBN(0x2c9a4699, 0xcb18e169), + 
TOBN(0x0ecd0e33, 0x1e6bbd20), TOBN(0x571b360e, 0xaf5e81d2)}, + {TOBN(0xcd9fea58, 0x101c1d45), TOBN(0x6651788e, 0x18880452), + TOBN(0xa9972635, 0x1f8dd446), TOBN(0x44bed022, 0xe37281d0)}}, + {{TOBN(0x094b2b2d, 0x33da525d), TOBN(0xf193678e, 0x13144fd8), + TOBN(0xb8ab5ba4, 0xf4c1061d), TOBN(0x4343b5fa, 0xdccbe0f4)}, + {TOBN(0xa8702371, 0x63812713), TOBN(0x47bf6d2d, 0xf7611d93), + TOBN(0x46729b8c, 0xbd21e1d7), TOBN(0x7484d4e0, 0xd629e77d)}}, + {{TOBN(0x830e6eea, 0x60dbac1f), TOBN(0x23d8c484, 0xda06a2f7), + TOBN(0x896714b0, 0x50ca535b), TOBN(0xdc8d3644, 0xebd97a9b)}, + {TOBN(0x106ef9fa, 0xb12177b4), TOBN(0xf79bf464, 0x534d5d9c), + TOBN(0x2537a349, 0xa6ab360b), TOBN(0xc7c54253, 0xa00c744f)}}, + {{TOBN(0xb3c7a047, 0xe5911a76), TOBN(0x61ffa5c8, 0x647f1ee7), + TOBN(0x15aed36f, 0x8f56ab42), TOBN(0x6a0d41b0, 0xa3ff9ac9)}, + {TOBN(0x68f469f5, 0xcc30d357), TOBN(0xbe9adf81, 0x6b72be96), + TOBN(0x1cd926fe, 0x903ad461), TOBN(0x7e89e38f, 0xcaca441b)}}, + {{TOBN(0xf0f82de5, 0xfacf69d4), TOBN(0x363b7e76, 0x4775344c), + TOBN(0x6894f312, 0xb2e36d04), TOBN(0x3c6cb4fe, 0x11d1c9a5)}, + {TOBN(0x85d9c339, 0x4008e1f2), TOBN(0x5e9a85ea, 0x249f326c), + TOBN(0xdc35c60a, 0x678c5e06), TOBN(0xc08b944f, 0x9f86fba9)}}, + {{TOBN(0xde40c02c, 0x89f71f0f), TOBN(0xad8f3e31, 0xff3da3c0), + TOBN(0x3ea5096b, 0x42125ded), TOBN(0x13879cbf, 0xa7379183)}, + {TOBN(0x6f4714a5, 0x6b306a0b), TOBN(0x359c2ea6, 0x67646c5e), + TOBN(0xfacf8943, 0x07726368), TOBN(0x07a58935, 0x65ff431e)}}, + {{TOBN(0x24d661d1, 0x68754ab0), TOBN(0x801fce1d, 0x6f429a76), + TOBN(0xc068a85f, 0xa58ce769), TOBN(0xedc35c54, 0x5d5eca2b)}, + {TOBN(0xea31276f, 0xa3f660d1), TOBN(0xa0184ebe, 0xb8fc7167), + TOBN(0x0f20f21a, 0x1d8db0ae), TOBN(0xd96d095f, 0x56c35e12)}}, + {{TOBN(0xedf402b5, 0xf8c2a25b), TOBN(0x1bb772b9, 0x059204b6), + TOBN(0x50cbeae2, 0x19b4e34c), TOBN(0x93109d80, 0x3fa0845a)}, + {TOBN(0x54f7ccf7, 0x8ef59fb5), TOBN(0x3b438fe2, 0x88070963), + TOBN(0x9e28c659, 0x31f3ba9b), TOBN(0x9cc31b46, 0xead9da92)}}, + {{TOBN(0x3c2f0ba9, 0xb733aa5f), TOBN(0xdece47cb, 0xf05af235), + TOBN(0xf8e3f715, 0xa2ac82a5), TOBN(0xc97ba641, 0x2203f18a)}, + {TOBN(0xc3af5504, 0x09c11060), TOBN(0x56ea2c05, 0x46af512d), + TOBN(0xfac28daf, 0xf3f28146), TOBN(0x87fab43a, 0x959ef494)}}}, + {{{TOBN(0x09891641, 0xd4c5105f), TOBN(0x1ae80f8e, 0x6d7fbd65), + TOBN(0x9d67225f, 0xbee6bdb0), TOBN(0x3b433b59, 0x7fc4d860)}, + {TOBN(0x44e66db6, 0x93e85638), TOBN(0xf7b59252, 0xe3e9862f), + TOBN(0xdb785157, 0x665c32ec), TOBN(0x702fefd7, 0xae362f50)}}, + {{TOBN(0x3754475d, 0x0fefb0c3), TOBN(0xd48fb56b, 0x46d7c35d), + TOBN(0xa070b633, 0x363798a4), TOBN(0xae89f3d2, 0x8fdb98e6)}, + {TOBN(0x970b89c8, 0x6363d14c), TOBN(0x89817521, 0x67abd27d), + TOBN(0x9bf7d474, 0x44d5a021), TOBN(0xb3083baf, 0xcac72aee)}}, + {{TOBN(0x389741de, 0xbe949a44), TOBN(0x638e9388, 0x546a4fa5), + TOBN(0x3fe6419c, 0xa0047bdc), TOBN(0x7047f648, 0xaaea57ca)}, + {TOBN(0x54e48a90, 0x41fbab17), TOBN(0xda8e0b28, 0x576bdba2), + TOBN(0xe807eebc, 0xc72afddc), TOBN(0x07d3336d, 0xf42577bf)}}, + {{TOBN(0x62a8c244, 0xbfe20925), TOBN(0x91c19ac3, 0x8fdce867), + TOBN(0x5a96a5d5, 0xdd387063), TOBN(0x61d587d4, 0x21d324f6)}, + {TOBN(0xe87673a2, 0xa37173ea), TOBN(0x23848008, 0x53778b65), + TOBN(0x10f8441e, 0x05bab43e), TOBN(0xfa11fe12, 0x4621efbe)}}, + {{TOBN(0x047b772e, 0x81685d7b), TOBN(0x23f27d81, 0xbf34a976), + TOBN(0xc27608e2, 0x915f48ef), TOBN(0x3b0b43fa, 0xa521d5c3)}, + {TOBN(0x7613fb26, 0x63ca7284), TOBN(0x7f5729b4, 0x1d4db837), + TOBN(0x87b14898, 0x583b526b), TOBN(0x00b732a6, 0xbbadd3d1)}}, + {{TOBN(0x8e02f426, 0x2048e396), TOBN(0x436b50b6, 
0x383d9de4), + TOBN(0xf78d3481, 0x471e85ad), TOBN(0x8b01ea6a, 0xd005c8d6)}, + {TOBN(0xd3c7afee, 0x97015c07), TOBN(0x46cdf1a9, 0x4e3ba2ae), + TOBN(0x7a42e501, 0x83d3a1d2), TOBN(0xd54b5268, 0xb541dff4)}}, + {{TOBN(0x3f24cf30, 0x4e23e9bc), TOBN(0x4387f816, 0x126e3624), + TOBN(0x26a46a03, 0x3b0b6d61), TOBN(0xaf1bc845, 0x8b2d777c)}, + {TOBN(0x25c401ba, 0x527de79c), TOBN(0x0e1346d4, 0x4261bbb6), + TOBN(0x4b96c44b, 0x287b4bc7), TOBN(0x658493c7, 0x5254562f)}}, + {{TOBN(0x23f949fe, 0xb8a24a20), TOBN(0x17ebfed1, 0xf52ca53f), + TOBN(0x9b691bbe, 0xbcfb4853), TOBN(0x5617ff6b, 0x6278a05d)}, + {TOBN(0x241b34c5, 0xe3c99ebd), TOBN(0xfc64242e, 0x1784156a), + TOBN(0x4206482f, 0x695d67df), TOBN(0xb967ce0e, 0xee27c011)}}, + {{TOBN(0x65db3751, 0x21c80b5d), TOBN(0x2e7a563c, 0xa31ecca0), + TOBN(0xe56ffc4e, 0x5238a07e), TOBN(0x3d6c2966, 0x32ced854)}, + {TOBN(0xe99d7d1a, 0xaf70b885), TOBN(0xafc3bad9, 0x2d686459), + TOBN(0x9c78bf46, 0x0cc8ba5b), TOBN(0x5a439519, 0x18955aa3)}}, + {{TOBN(0xf8b517a8, 0x5fe4e314), TOBN(0xe60234d0, 0xfcb8906f), + TOBN(0xffe542ac, 0xf2061b23), TOBN(0x287e191f, 0x6b4cb59c)}, + {TOBN(0x21857ddc, 0x09d877d8), TOBN(0x1c23478c, 0x14678941), + TOBN(0xbbf0c056, 0xb6e05ea4), TOBN(0x82da4b53, 0xb01594fe)}}, + {{TOBN(0xf7526791, 0xfadb8608), TOBN(0x049e832d, 0x7b74cdf6), + TOBN(0xa43581cc, 0xc2b90a34), TOBN(0x73639eb8, 0x9360b10c)}, + {TOBN(0x4fba331f, 0xe1e4a71b), TOBN(0x6ffd6b93, 0x8072f919), + TOBN(0x6e53271c, 0x65679032), TOBN(0x67206444, 0xf14272ce)}}, + {{TOBN(0xc0f734a3, 0xb2335834), TOBN(0x9526205a, 0x90ef6860), + TOBN(0xcb8be717, 0x04e2bb0d), TOBN(0x2418871e, 0x02f383fa)}, + {TOBN(0xd7177681, 0x4082c157), TOBN(0xcc914ad0, 0x29c20073), + TOBN(0xf186c1eb, 0xe587e728), TOBN(0x6fdb3c22, 0x61bcd5fd)}}, + {{TOBN(0x30d014a6, 0xf2f9f8e9), TOBN(0x963ece23, 0x4fec49d2), + TOBN(0x862025c5, 0x9605a8d9), TOBN(0x39874445, 0x19f8929a)}, + {TOBN(0x01b6ff65, 0x12bf476a), TOBN(0x598a64d8, 0x09cf7d91), + TOBN(0xd7ec7749, 0x93be56ca), TOBN(0x10899785, 0xcbb33615)}}, + {{TOBN(0xb8a092fd, 0x02eee3ad), TOBN(0xa86b3d35, 0x30145270), + TOBN(0x323d98c6, 0x8512b675), TOBN(0x4b8bc785, 0x62ebb40f)}, + {TOBN(0x7d301f54, 0x413f9cde), TOBN(0xa5e4fb4f, 0x2bab5664), + TOBN(0x1d2b252d, 0x1cbfec23), TOBN(0xfcd576bb, 0xe177120d)}}, + {{TOBN(0x04427d3e, 0x83731a34), TOBN(0x2bb9028e, 0xed836e8e), + TOBN(0xb36acff8, 0xb612ca7c), TOBN(0xb88fe5ef, 0xd3d9c73a)}, + {TOBN(0xbe2a6bc6, 0xedea4eb3), TOBN(0x43b93133, 0x488eec77), + TOBN(0xf41ff566, 0xb17106e1), TOBN(0x469e9172, 0x654efa32)}}, + {{TOBN(0xb4480f04, 0x41c23fa3), TOBN(0xb4712eb0, 0xc1989a2e), + TOBN(0x3ccbba0f, 0x93a29ca7), TOBN(0x6e205c14, 0xd619428c)}, + {TOBN(0x90db7957, 0xb3641686), TOBN(0x0432691d, 0x45ac8b4e), + TOBN(0x07a759ac, 0xf64e0350), TOBN(0x0514d89c, 0x9c972517)}}, + {{TOBN(0x1701147f, 0xa8e67fc3), TOBN(0x9e2e0b8b, 0xab2085be), + TOBN(0xd5651824, 0xac284e57), TOBN(0x890d4325, 0x74893664)}, + {TOBN(0x8a7c5e6e, 0xc55e68a3), TOBN(0xbf12e90b, 0x4339c85a), + TOBN(0x31846b85, 0xf922b655), TOBN(0x9a54ce4d, 0x0bf4d700)}}, + {{TOBN(0xd7f4e83a, 0xf1a14295), TOBN(0x916f955c, 0xb285d4f9), + TOBN(0xe57bb0e0, 0x99ffdaba), TOBN(0x28a43034, 0xeab0d152)}, + {TOBN(0x0a36ffa2, 0xb8a9cef8), TOBN(0x5517407e, 0xb9ec051a), + TOBN(0x9c796096, 0xea68e672), TOBN(0x853db5fb, 0xfb3c77fb)}}, + {{TOBN(0x21474ba9, 0xe864a51a), TOBN(0x6c267699, 0x6e8a1b8b), + TOBN(0x7c823626, 0x94120a28), TOBN(0xe61e9a48, 0x8383a5db)}, + {TOBN(0x7dd75003, 0x9f84216d), TOBN(0xab020d07, 0xad43cd85), + TOBN(0x9437ae48, 0xda12c659), TOBN(0x6449c2eb, 0xe65452ad)}}, + {{TOBN(0xcc7c4c1c, 0x2cf9d7c1), 
TOBN(0x1320886a, 0xee95e5ab), + TOBN(0xbb7b9056, 0xbeae170c), TOBN(0xc8a5b250, 0xdbc0d662)}, + {TOBN(0x4ed81432, 0xc11d2303), TOBN(0x7da66912, 0x1f03769f), + TOBN(0x3ac7a5fd, 0x84539828), TOBN(0x14dada94, 0x3bccdd02)}}, + {{TOBN(0x8b84c321, 0x7ef6b0d1), TOBN(0x52a9477a, 0x7c933f22), + TOBN(0x5ef6728a, 0xfd440b82), TOBN(0x5c3bd859, 0x6ce4bd5e)}, + {TOBN(0x918b80f5, 0xf22c2d3e), TOBN(0x368d5040, 0xb7bb6cc5), + TOBN(0xb66142a1, 0x2695a11c), TOBN(0x60ac583a, 0xeb19ea70)}}, + {{TOBN(0x317cbb98, 0x0eab2437), TOBN(0x8cc08c55, 0x5e2654c8), + TOBN(0xfe2d6520, 0xe6d8307f), TOBN(0xe9f147f3, 0x57428993)}, + {TOBN(0x5f9c7d14, 0xd2fd6cf1), TOBN(0xa3ecd064, 0x2d4fcbb0), + TOBN(0xad83fef0, 0x8e7341f7), TOBN(0x643f23a0, 0x3a63115c)}}, + {{TOBN(0xd38a78ab, 0xe65ab743), TOBN(0xbf7c75b1, 0x35edc89c), + TOBN(0x3dd8752e, 0x530df568), TOBN(0xf85c4a76, 0xe308c682)}, + {TOBN(0x4c9955b2, 0xe68acf37), TOBN(0xa544df3d, 0xab32af85), + TOBN(0x4b8ec3f5, 0xa25cf493), TOBN(0x4d8f2764, 0x1a622feb)}}, + {{TOBN(0x7bb4f7aa, 0xf0dcbc49), TOBN(0x7de551f9, 0x70bbb45b), + TOBN(0xcfd0f3e4, 0x9f2ca2e5), TOBN(0xece58709, 0x1f5c76ef)}, + {TOBN(0x32920edd, 0x167d79ae), TOBN(0x039df8a2, 0xfa7d7ec1), + TOBN(0xf46206c0, 0xbb30af91), TOBN(0x1ff5e2f5, 0x22676b59)}}, + {{TOBN(0x11f4a039, 0x6ea51d66), TOBN(0x506c1445, 0x807d7a26), + TOBN(0x60da5705, 0x755a9b24), TOBN(0x8fc8cc32, 0x1f1a319e)}, + {TOBN(0x83642d4d, 0x9433d67d), TOBN(0x7fa5cb8f, 0x6a7dd296), + TOBN(0x576591db, 0x9b7bde07), TOBN(0x13173d25, 0x419716fb)}}, + {{TOBN(0xea30599d, 0xd5b340ff), TOBN(0xfc6b5297, 0xb0fe76c5), + TOBN(0x1c6968c8, 0xab8f5adc), TOBN(0xf723c7f5, 0x901c928d)}, + {TOBN(0x4203c321, 0x9773d402), TOBN(0xdf7c6aa3, 0x1b51dd47), + TOBN(0x3d49e37a, 0x552be23c), TOBN(0x57febee8, 0x0b5a6e87)}}, + {{TOBN(0xc5ecbee4, 0x7bd8e739), TOBN(0x79d44994, 0xae63bf75), + TOBN(0x168bd00f, 0x38fb8923), TOBN(0x75d48ee4, 0xd0533130)}, + {TOBN(0x554f77aa, 0xdb5cdf33), TOBN(0x3396e896, 0x3c696769), + TOBN(0x2fdddbf2, 0xd3fd674e), TOBN(0xbbb8f6ee, 0x99d0e3e5)}}, + {{TOBN(0x51b90651, 0xcbae2f70), TOBN(0xefc4bc05, 0x93aaa8eb), + TOBN(0x8ecd8689, 0xdd1df499), TOBN(0x1aee99a8, 0x22f367a5)}, + {TOBN(0x95d485b9, 0xae8274c5), TOBN(0x6c14d445, 0x7d30b39c), + TOBN(0xbafea90b, 0xbcc1ef81), TOBN(0x7c5f317a, 0xa459a2ed)}}, + {{TOBN(0x01211075, 0x4ef44227), TOBN(0xa17bed6e, 0xdc20f496), + TOBN(0x0cdfe424, 0x819853cd), TOBN(0x13793298, 0xf71e2ce7)}, + {TOBN(0x3c1f3078, 0xdbbe307b), TOBN(0x6dd1c20e, 0x76ee9936), + TOBN(0x23ee4b57, 0x423caa20), TOBN(0x4ac3793b, 0x8efb840e)}}, + {{TOBN(0x934438eb, 0xed1f8ca0), TOBN(0x3e546658, 0x4ebb25a2), + TOBN(0xc415af0e, 0xc069896f), TOBN(0xc13eddb0, 0x9a5aa43d)}, + {TOBN(0x7a04204f, 0xd49eb8f6), TOBN(0xd0d5bdfc, 0xd74f1670), + TOBN(0x3697e286, 0x56fc0558), TOBN(0x10207371, 0x01cebade)}}, + {{TOBN(0x5f87e690, 0x0647a82b), TOBN(0x908e0ed4, 0x8f40054f), + TOBN(0xa9f633d4, 0x79853803), TOBN(0x8ed13c9a, 0x4a28b252)}, + {TOBN(0x3e2ef676, 0x1f460f64), TOBN(0x53930b9b, 0x36d06336), + TOBN(0x347073ac, 0x8fc4979b), TOBN(0x84380e0e, 0x5ecd5597)}}, + {{TOBN(0xe3b22c6b, 0xc4fe3c39), TOBN(0xba4a8153, 0x6c7bebdf), + TOBN(0xf23ab6b7, 0x25693459), TOBN(0x53bc3770, 0x14922b11)}, + {TOBN(0x4645c8ab, 0x5afc60db), TOBN(0xaa022355, 0x20b9f2a3), + TOBN(0x52a2954c, 0xce0fc507), TOBN(0x8c2731bb, 0x7ce1c2e7)}}, + {{TOBN(0xf39608ab, 0x18a0339d), TOBN(0xac7a658d, 0x3735436c), + TOBN(0xb22c2b07, 0xcd992b4f), TOBN(0x4e83daec, 0xf40dcfd4)}, + {TOBN(0x8a34c7be, 0x2f39ea3e), TOBN(0xef0c005f, 0xb0a56d2e), + TOBN(0x62731f6a, 0x6edd8038), TOBN(0x5721d740, 0x4e3cb075)}}, + {{TOBN(0x1ea41511, 
0xfbeeee1b), TOBN(0xd1ef5e73, 0xef1d0c05), + TOBN(0x42feefd1, 0x73c07d35), TOBN(0xe530a00a, 0x8a329493)}, + {TOBN(0x5d55b7fe, 0xf15ebfb0), TOBN(0x549de03c, 0xd322491a), + TOBN(0xf7b5f602, 0x745b3237), TOBN(0x3632a3a2, 0x1ab6e2b6)}}, + {{TOBN(0x0d3bba89, 0x0ef59f78), TOBN(0x0dfc6443, 0xc9e52b9a), + TOBN(0x1dc79699, 0x72631447), TOBN(0xef033917, 0xb3be20b1)}, + {TOBN(0x0c92735d, 0xb1383948), TOBN(0xc1fc29a2, 0xc0dd7d7d), + TOBN(0x6485b697, 0x403ed068), TOBN(0x13bfaab3, 0xaac93bdc)}}, + {{TOBN(0x410dc6a9, 0x0deeaf52), TOBN(0xb003fb02, 0x4c641c15), + TOBN(0x1384978c, 0x5bc504c4), TOBN(0x37640487, 0x864a6a77)}, + {TOBN(0x05991bc6, 0x222a77da), TOBN(0x62260a57, 0x5e47eb11), + TOBN(0xc7af6613, 0xf21b432c), TOBN(0x22f3acc9, 0xab4953e9)}}, + {{TOBN(0x52934922, 0x8e41d155), TOBN(0x4d024568, 0x3ac059ef), + TOBN(0xb0201755, 0x4d884411), TOBN(0xce8055cf, 0xa59a178f)}, + {TOBN(0xcd77d1af, 0xf6204549), TOBN(0xa0a00a3e, 0xc7066759), + TOBN(0x471071ef, 0x0272c229), TOBN(0x009bcf6b, 0xd3c4b6b0)}}, + {{TOBN(0x2a2638a8, 0x22305177), TOBN(0xd51d59df, 0x41645bbf), + TOBN(0xa81142fd, 0xc0a7a3c0), TOBN(0xa17eca6d, 0x4c7063ee)}, + {TOBN(0x0bb887ed, 0x60d9dcec), TOBN(0xd6d28e51, 0x20ad2455), + TOBN(0xebed6308, 0xa67102ba), TOBN(0x042c3114, 0x8bffa408)}}, + {{TOBN(0xfd099ac5, 0x8aa68e30), TOBN(0x7a6a3d7c, 0x1483513e), + TOBN(0xffcc6b75, 0xba2d8f0c), TOBN(0x54dacf96, 0x1e78b954)}, + {TOBN(0xf645696f, 0xa4a9af89), TOBN(0x3a411940, 0x06ac98ec), + TOBN(0x41b8b3f6, 0x22a67a20), TOBN(0x2d0b1e0f, 0x99dec626)}}, + {{TOBN(0x27c89192, 0x40be34e8), TOBN(0xc7162b37, 0x91907f35), + TOBN(0x90188ec1, 0xa956702b), TOBN(0xca132f7d, 0xdf93769c)}, + {TOBN(0x3ece44f9, 0x0e2025b4), TOBN(0x67aaec69, 0x0c62f14c), + TOBN(0xad741418, 0x22e3cc11), TOBN(0xcf9b75c3, 0x7ff9a50e)}}, + {{TOBN(0x02fa2b16, 0x4d348272), TOBN(0xbd99d61a, 0x9959d56d), + TOBN(0xbc4f19db, 0x18762916), TOBN(0xcc7cce50, 0x49c1ac80)}, + {TOBN(0x4d59ebaa, 0xd846bd83), TOBN(0x8775a9dc, 0xa9202849), + TOBN(0x07ec4ae1, 0x6e1f4ca9), TOBN(0x27eb5875, 0xba893f11)}}, + {{TOBN(0x00284d51, 0x662cc565), TOBN(0x82353a6b, 0x0db4138d), + TOBN(0xd9c7aaaa, 0xaa32a594), TOBN(0xf5528b5e, 0xa5669c47)}, + {TOBN(0xf3220231, 0x2f23c5ff), TOBN(0xe3e8147a, 0x6affa3a1), + TOBN(0xfb423d5c, 0x202ddda0), TOBN(0x3d6414ac, 0x6b871bd4)}}, + {{TOBN(0x586f82e1, 0xa51a168a), TOBN(0xb712c671, 0x48ae5448), + TOBN(0x9a2e4bd1, 0x76233eb8), TOBN(0x0188223a, 0x78811ca9)}, + {TOBN(0x553c5e21, 0xf7c18de1), TOBN(0x7682e451, 0xb27bb286), + TOBN(0x3ed036b3, 0x0e51e929), TOBN(0xf487211b, 0xec9cb34f)}}, + {{TOBN(0x0d094277, 0x0c24efc8), TOBN(0x0349fd04, 0xbef737a4), + TOBN(0x6d1c9dd2, 0x514cdd28), TOBN(0x29c135ff, 0x30da9521)}, + {TOBN(0xea6e4508, 0xf78b0b6f), TOBN(0x176f5dd2, 0x678c143c), + TOBN(0x08148418, 0x4be21e65), TOBN(0x27f7525c, 0xe7df38c4)}}, + {{TOBN(0x1fb70e09, 0x748ab1a4), TOBN(0x9cba50a0, 0x5efe4433), + TOBN(0x7846c7a6, 0x15f75af2), TOBN(0x2a7c2c57, 0x5ee73ea8)}, + {TOBN(0x42e566a4, 0x3f0a449a), TOBN(0x45474c3b, 0xad90fc3d), + TOBN(0x7447be3d, 0x8b61d057), TOBN(0x3e9d1cf1, 0x3a4ec092)}}, + {{TOBN(0x1603e453, 0xf380a6e6), TOBN(0x0b86e431, 0x9b1437c2), + TOBN(0x7a4173f2, 0xef29610a), TOBN(0x8fa729a7, 0xf03d57f7)}, + {TOBN(0x3e186f6e, 0x6c9c217e), TOBN(0xbe1d3079, 0x91919524), + TOBN(0x92a62a70, 0x153d4fb1), TOBN(0x32ed3e34, 0xd68c2f71)}}, + {{TOBN(0xd785027f, 0x9eb1a8b7), TOBN(0xbc37eb77, 0xc5b22fe8), + TOBN(0x466b34f0, 0xb9d6a191), TOBN(0x008a89af, 0x9a05f816)}, + {TOBN(0x19b028fb, 0x7d42c10a), TOBN(0x7fe8c92f, 0x49b3f6b8), + TOBN(0x58907cc0, 0xa5a0ade3), TOBN(0xb3154f51, 0x559d1a7c)}}, + 
{{TOBN(0x5066efb6, 0xd9790ed6), TOBN(0xa77a0cbc, 0xa6aa793b), + TOBN(0x1a915f3c, 0x223e042e), TOBN(0x1c5def04, 0x69c5874b)}, + {TOBN(0x0e830078, 0x73b6c1da), TOBN(0x55cf85d2, 0xfcd8557a), + TOBN(0x0f7c7c76, 0x0460f3b1), TOBN(0x87052acb, 0x46e58063)}}, + {{TOBN(0x09212b80, 0x907eae66), TOBN(0x3cb068e0, 0x4d721c89), + TOBN(0xa87941ae, 0xdd45ac1c), TOBN(0xde8d5c0d, 0x0daa0dbb)}, + {TOBN(0xda421fdc, 0xe3502e6e), TOBN(0xc8944201, 0x4d89a084), + TOBN(0x7307ba5e, 0xf0c24bfb), TOBN(0xda212beb, 0x20bde0ef)}}, + {{TOBN(0xea2da24b, 0xf82ce682), TOBN(0x058d3816, 0x07f71fe4), + TOBN(0x35a02462, 0x5ffad8de), TOBN(0xcd7b05dc, 0xaadcefab)}, + {TOBN(0xd442f8ed, 0x1d9f54ec), TOBN(0x8be3d618, 0xb2d3b5ca), + TOBN(0xe2220ed0, 0xe06b2ce2), TOBN(0x82699a5f, 0x1b0da4c0)}}, + {{TOBN(0x3ff106f5, 0x71c0c3a7), TOBN(0x8f580f5a, 0x0d34180c), + TOBN(0x4ebb120e, 0x22d7d375), TOBN(0x5e5782cc, 0xe9513675)}, + {TOBN(0x2275580c, 0x99c82a70), TOBN(0xe8359fbf, 0x15ea8c4c), + TOBN(0x53b48db8, 0x7b415e70), TOBN(0xaacf2240, 0x100c6014)}}, + {{TOBN(0x9faaccf5, 0xe4652f1d), TOBN(0xbd6fdd2a, 0xd56157b2), + TOBN(0xa4f4fb1f, 0x6261ec50), TOBN(0x244e55ad, 0x476bcd52)}, + {TOBN(0x881c9305, 0x047d320b), TOBN(0x1ca983d5, 0x6181263f), + TOBN(0x354e9a44, 0x278fb8ee), TOBN(0xad2dbc0f, 0x396e4964)}}, + {{TOBN(0x723f3aa2, 0x9268b3de), TOBN(0x0d1ca29a, 0xe6e0609a), + TOBN(0x794866aa, 0x6cf44252), TOBN(0x0b59f3e3, 0x01af87ed)}, + {TOBN(0xe234e5ff, 0x7f4a6c51), TOBN(0xa8768fd2, 0x61dc2f7e), + TOBN(0xdafc7332, 0x0a94d81f), TOBN(0xd7f84282, 0x06938ce1)}}, + {{TOBN(0xae0b3c0e, 0x0546063e), TOBN(0x7fbadcb2, 0x5d61abc6), + TOBN(0xd5d7a2c9, 0x369ac400), TOBN(0xa5978d09, 0xae67d10c)}, + {TOBN(0x290f211e, 0x4f85eaac), TOBN(0xe61e2ad1, 0xfacac681), + TOBN(0xae125225, 0x388384cd), TOBN(0xa7fb68e9, 0xccfde30f)}}, + {{TOBN(0x7a59b936, 0x3daed4c2), TOBN(0x80a9aa40, 0x2606f789), + TOBN(0xb40c1ea5, 0xf6a6d90a), TOBN(0x948364d3, 0x514d5885)}, + {TOBN(0x062ebc60, 0x70985182), TOBN(0xa6db5b0e, 0x33310895), + TOBN(0x64a12175, 0xe329c2f5), TOBN(0xc5f25bd2, 0x90ea237e)}}, + {{TOBN(0x7915c524, 0x2d0a4c23), TOBN(0xeb5d26e4, 0x6bb3cc52), + TOBN(0x369a9116, 0xc09e2c92), TOBN(0x0c527f92, 0xcf182cf8)}, + {TOBN(0x9e591938, 0x2aede0ac), TOBN(0xb2922208, 0x6cc34939), + TOBN(0x3c9d8962, 0x99a34361), TOBN(0x3c81836d, 0xc1905fe6)}}, + {{TOBN(0x4bfeb57f, 0xa001ec5a), TOBN(0xe993f5bb, 0xa0dc5dba), + TOBN(0x47884109, 0x724a1380), TOBN(0x8a0369ab, 0x32fe9a04)}, + {TOBN(0xea068d60, 0x8c927db8), TOBN(0xbf5f37cf, 0x94655741), + TOBN(0x47d402a2, 0x04b6c7ea), TOBN(0x4551c295, 0x6af259cb)}}, + {{TOBN(0x698b71e7, 0xed77ee8b), TOBN(0xbddf7bd0, 0xf309d5c7), + TOBN(0x6201c22c, 0x34e780ca), TOBN(0xab04f7d8, 0x4c295ef4)}, + {TOBN(0x1c947294, 0x4313a8ce), TOBN(0xe532e4ac, 0x92ca4cfe), + TOBN(0x89738f80, 0xd0a7a97a), TOBN(0xec088c88, 0xa580fd5b)}}, + {{TOBN(0x612b1ecc, 0x42ce9e51), TOBN(0x8f9840fd, 0xb25fdd2a), + TOBN(0x3cda78c0, 0x01e7f839), TOBN(0x546b3d3a, 0xece05480)}, + {TOBN(0x271719a9, 0x80d30916), TOBN(0x45497107, 0x584c20c4), + TOBN(0xaf8f9478, 0x5bc78608), TOBN(0x28c7d484, 0x277e2a4c)}}, + {{TOBN(0xfce01767, 0x88a2ffe4), TOBN(0xdc506a35, 0x28e169a5), + TOBN(0x0ea10861, 0x7af9c93a), TOBN(0x1ed24361, 0x03fa0e08)}, + {TOBN(0x96eaaa92, 0xa3d694e7), TOBN(0xc0f43b4d, 0xef50bc74), + TOBN(0xce6aa58c, 0x64114db4), TOBN(0x8218e8ea, 0x7c000fd4)}}, + {{TOBN(0xac815dfb, 0x185f8844), TOBN(0xcd7e90cb, 0x1557abfb), + TOBN(0x23d16655, 0xafbfecdf), TOBN(0x80f3271f, 0x085cac4a)}, + {TOBN(0x7fc39aa7, 0xd0e62f47), TOBN(0x88d519d1, 0x460a48e5), + TOBN(0x59559ac4, 0xd28f101e), TOBN(0x7981d9e9, 0xca9ae816)}}, 
+ {{TOBN(0x5c38652c, 0x9ac38203), TOBN(0x86eaf87f, 0x57657fe5), + TOBN(0x568fc472, 0xe21f5416), TOBN(0x2afff39c, 0xe7e597b5)}, + {TOBN(0x3adbbb07, 0x256d4eab), TOBN(0x22598692, 0x8285ab89), + TOBN(0x35f8112a, 0x041caefe), TOBN(0x95df02e3, 0xa5064c8b)}}, + {{TOBN(0x4d63356e, 0xc7004bf3), TOBN(0x230a08f4, 0xdb83c7de), + TOBN(0xca27b270, 0x8709a7b7), TOBN(0x0d1c4cc4, 0xcb9abd2d)}, + {TOBN(0x8a0bc66e, 0x7550fee8), TOBN(0x369cd4c7, 0x9cf7247e), + TOBN(0x75562e84, 0x92b5b7e7), TOBN(0x8fed0da0, 0x5802af7b)}}, + {{TOBN(0x6a7091c2, 0xe48fb889), TOBN(0x26882c13, 0x7b8a9d06), + TOBN(0xa2498663, 0x1b82a0e2), TOBN(0x844ed736, 0x3518152d)}, + {TOBN(0x282f476f, 0xd86e27c7), TOBN(0xa04edaca, 0x04afefdc), + TOBN(0x8b256ebc, 0x6119e34d), TOBN(0x56a413e9, 0x0787d78b)}}}, + {{{TOBN(0x82ee061d, 0x5a74be50), TOBN(0xe41781c4, 0xdea16ff5), + TOBN(0xe0b0c81e, 0x99bfc8a2), TOBN(0x624f4d69, 0x0b547e2d)}, + {TOBN(0x3a83545d, 0xbdcc9ae4), TOBN(0x2573dbb6, 0x409b1e8e), + TOBN(0x482960c4, 0xa6c93539), TOBN(0xf01059ad, 0x5ae18798)}}, + {{TOBN(0x715c9f97, 0x3112795f), TOBN(0xe8244437, 0x984e6ee1), + TOBN(0x55cb4858, 0xecb66bcd), TOBN(0x7c136735, 0xabaffbee)}, + {TOBN(0x54661595, 0x5dbec38e), TOBN(0x51c0782c, 0x388ad153), + TOBN(0x9ba4c53a, 0xc6e0952f), TOBN(0x27e6782a, 0x1b21dfa8)}}, + {{TOBN(0x682f903d, 0x4ed2dbc2), TOBN(0x0eba59c8, 0x7c3b2d83), + TOBN(0x8e9dc84d, 0x9c7e9335), TOBN(0x5f9b21b0, 0x0eb226d7)}, + {TOBN(0xe33bd394, 0xaf267bae), TOBN(0xaa86cc25, 0xbe2e15ae), + TOBN(0x4f0bf67d, 0x6a8ec500), TOBN(0x5846aa44, 0xf9630658)}}, + {{TOBN(0xfeb09740, 0xe2c2bf15), TOBN(0x627a2205, 0xa9e99704), + TOBN(0xec8d73d0, 0xc2fbc565), TOBN(0x223eed8f, 0xc20c8de8)}, + {TOBN(0x1ee32583, 0xa8363b49), TOBN(0x1a0b6cb9, 0xc9c2b0a6), + TOBN(0x49f7c3d2, 0x90dbc85c), TOBN(0xa8dfbb97, 0x1ef4c1ac)}}, + {{TOBN(0xafb34d4c, 0x65c7c2ab), TOBN(0x1d4610e7, 0xe2c5ea84), + TOBN(0x893f6d1b, 0x973c4ab5), TOBN(0xa3cdd7e9, 0x945ba5c4)}, + {TOBN(0x60514983, 0x064417ee), TOBN(0x1459b23c, 0xad6bdf2b), + TOBN(0x23b2c341, 0x5cf726c3), TOBN(0x3a829635, 0x32d6354a)}}, + {{TOBN(0x294f901f, 0xab192c18), TOBN(0xec5fcbfe, 0x7030164f), + TOBN(0xe2e2fcb7, 0xe2246ba6), TOBN(0x1e7c88b3, 0x221a1a0c)}, + {TOBN(0x72c7dd93, 0xc92d88c5), TOBN(0x41c2148e, 0x1106fb59), + TOBN(0x547dd4f5, 0xa0f60f14), TOBN(0xed9b52b2, 0x63960f31)}}, + {{TOBN(0x6c8349eb, 0xb0a5b358), TOBN(0xb154c5c2, 0x9e7e2ed6), + TOBN(0xcad5eccf, 0xeda462db), TOBN(0xf2d6dbe4, 0x2de66b69)}, + {TOBN(0x426aedf3, 0x8665e5b2), TOBN(0x488a8513, 0x7b7f5723), + TOBN(0x15cc43b3, 0x8bcbb386), TOBN(0x27ad0af3, 0xd791d879)}}, + {{TOBN(0xc16c236e, 0x846e364f), TOBN(0x7f33527c, 0xdea50ca0), + TOBN(0xc4810775, 0x0926b86d), TOBN(0x6c2a3609, 0x0598e70c)}, + {TOBN(0xa6755e52, 0xf024e924), TOBN(0xe0fa07a4, 0x9db4afca), + TOBN(0x15c3ce7d, 0x66831790), TOBN(0x5b4ef350, 0xa6cbb0d6)}}, + {{TOBN(0x2c4aafc4, 0xb6205969), TOBN(0x42563f02, 0xf6c7854f), + TOBN(0x016aced5, 0x1d983b48), TOBN(0xfeb356d8, 0x99949755)}, + {TOBN(0x8c2a2c81, 0xd1a39bd7), TOBN(0x8f44340f, 0xe6934ae9), + TOBN(0x148cf91c, 0x447904da), TOBN(0x7340185f, 0x0f51a926)}}, + {{TOBN(0x2f8f00fb, 0x7409ab46), TOBN(0x057e78e6, 0x80e289b2), + TOBN(0x03e5022c, 0xa888e5d1), TOBN(0x3c87111a, 0x9dede4e2)}, + {TOBN(0x5b9b0e1c, 0x7809460b), TOBN(0xe751c852, 0x71c9abc7), + TOBN(0x8b944e28, 0xc7cc1dc9), TOBN(0x4f201ffa, 0x1d3cfa08)}}, + {{TOBN(0x02fc905c, 0x3e6721ce), TOBN(0xd52d70da, 0xd0b3674c), + TOBN(0x5dc2e5ca, 0x18810da4), TOBN(0xa984b273, 0x5c69dd99)}, + {TOBN(0x63b92527, 0x84de5ca4), TOBN(0x2f1c9872, 0xc852dec4), + TOBN(0x18b03593, 0xc2e3de09), TOBN(0x19d70b01, 
0x9813dc2f)}}, + {{TOBN(0x42806b2d, 0xa6dc1d29), TOBN(0xd3030009, 0xf871e144), + TOBN(0xa1feb333, 0xaaf49276), TOBN(0xb5583b9e, 0xc70bc04b)}, + {TOBN(0x1db0be78, 0x95695f20), TOBN(0xfc841811, 0x89d012b5), + TOBN(0x6409f272, 0x05f61643), TOBN(0x40d34174, 0xd5883128)}}, + {{TOBN(0xd79196f5, 0x67419833), TOBN(0x6059e252, 0x863b7b08), + TOBN(0x84da1817, 0x1c56700c), TOBN(0x5758ee56, 0xb28d3ec4)}, + {TOBN(0x7da2771d, 0x013b0ea6), TOBN(0xfddf524b, 0x54c5e9b9), + TOBN(0x7df4faf8, 0x24305d80), TOBN(0x58f5c1bf, 0x3a97763f)}}, + {{TOBN(0xa5af37f1, 0x7c696042), TOBN(0xd4cba22c, 0x4a2538de), + TOBN(0x211cb995, 0x9ea42600), TOBN(0xcd105f41, 0x7b069889)}, + {TOBN(0xb1e1cf19, 0xddb81e74), TOBN(0x472f2d89, 0x5157b8ca), + TOBN(0x086fb008, 0xee9db885), TOBN(0x365cd570, 0x0f26d131)}}, + {{TOBN(0x284b02bb, 0xa2be7053), TOBN(0xdcbbf7c6, 0x7ab9a6d6), + TOBN(0x4425559c, 0x20f7a530), TOBN(0x961f2dfa, 0x188767c8)}, + {TOBN(0xe2fd9435, 0x70dc80c4), TOBN(0x104d6b63, 0xf0784120), + TOBN(0x7f592bc1, 0x53567122), TOBN(0xf6bc1246, 0xf688ad77)}}, + {{TOBN(0x05214c05, 0x0f15dde9), TOBN(0xa47a76a8, 0x0d5f2b82), + TOBN(0xbb254d30, 0x62e82b62), TOBN(0x11a05fe0, 0x3ec955ee)}, + {TOBN(0x7eaff46e, 0x9d529b36), TOBN(0x55ab1301, 0x8f9e3df6), + TOBN(0xc463e371, 0x99317698), TOBN(0xfd251438, 0xccda47ad)}}, + {{TOBN(0xca9c3547, 0x23d695ea), TOBN(0x48ce626e, 0x16e589b5), + TOBN(0x6b5b64c7, 0xb187d086), TOBN(0xd02e1794, 0xb2207948)}, + {TOBN(0x8b58e98f, 0x7198111d), TOBN(0x90ca6305, 0xdcf9c3cc), + TOBN(0x5691fe72, 0xf34089b0), TOBN(0x60941af1, 0xfc7c80ff)}}, + {{TOBN(0xa09bc0a2, 0x22eb51e5), TOBN(0xc0bb7244, 0xaa9cf09a), + TOBN(0x36a8077f, 0x80159f06), TOBN(0x8b5c989e, 0xdddc560e)}, + {TOBN(0x19d2f316, 0x512e1f43), TOBN(0x02eac554, 0xad08ff62), + TOBN(0x012ab84c, 0x07d20b4e), TOBN(0x37d1e115, 0xd6d4e4e1)}}, + {{TOBN(0xb6443e1a, 0xab7b19a8), TOBN(0xf08d067e, 0xdef8cd45), + TOBN(0x63adf3e9, 0x685e03da), TOBN(0xcf15a10e, 0x4792b916)}, + {TOBN(0xf44bcce5, 0xb738a425), TOBN(0xebe131d5, 0x9636b2fd), + TOBN(0x94068841, 0x7850d605), TOBN(0x09684eaa, 0xb40d749d)}}, + {{TOBN(0x8c3c669c, 0x72ba075b), TOBN(0x89f78b55, 0xba469015), + TOBN(0x5706aade, 0x3e9f8ba8), TOBN(0x6d8bd565, 0xb32d7ed7)}, + {TOBN(0x25f4e63b, 0x805f08d6), TOBN(0x7f48200d, 0xc3bcc1b5), + TOBN(0x4e801968, 0xb025d847), TOBN(0x74afac04, 0x87cbe0a8)}}, + {{TOBN(0x43ed2c2b, 0x7e63d690), TOBN(0xefb6bbf0, 0x0223cdb8), + TOBN(0x4fec3cae, 0x2884d3fe), TOBN(0x065ecce6, 0xd75e25a4)}, + {TOBN(0x6c2294ce, 0x69f79071), TOBN(0x0d9a8e5f, 0x044b8666), + TOBN(0x5009f238, 0x17b69d8f), TOBN(0x3c29f8fe, 0xc5dfdaf7)}}, + {{TOBN(0x9067528f, 0xebae68c4), TOBN(0x5b385632, 0x30c5ba21), + TOBN(0x540df119, 0x1fdd1aec), TOBN(0xcf37825b, 0xcfba4c78)}, + {TOBN(0x77eff980, 0xbeb11454), TOBN(0x40a1a991, 0x60c1b066), + TOBN(0xe8018980, 0xf889a1c7), TOBN(0xb9c52ae9, 0x76c24be0)}}, + {{TOBN(0x05fbbcce, 0x45650ef4), TOBN(0xae000f10, 0x8aa29ac7), + TOBN(0x884b7172, 0x4f04c470), TOBN(0x7cd4fde2, 0x19bb5c25)}, + {TOBN(0x6477b22a, 0xe8840869), TOBN(0xa8868859, 0x5fbd0686), + TOBN(0xf23cc02e, 0x1116dfba), TOBN(0x76cd563f, 0xd87d7776)}}, + {{TOBN(0xe2a37598, 0xa9d82abf), TOBN(0x5f188ccb, 0xe6c170f5), + TOBN(0x81682200, 0x5066b087), TOBN(0xda22c212, 0xc7155ada)}, + {TOBN(0x151e5d3a, 0xfbddb479), TOBN(0x4b606b84, 0x6d715b99), + TOBN(0x4a73b54b, 0xf997cb2e), TOBN(0x9a1bfe43, 0x3ecd8b66)}}, + {{TOBN(0x1c312809, 0x2a67d48a), TOBN(0xcd6a671e, 0x031fa9e2), + TOBN(0xbec3312a, 0x0e43a34a), TOBN(0x1d935639, 0x55ef47d3)}, + {TOBN(0x5ea02489, 0x8fea73ea), TOBN(0x8247b364, 0xa035afb2), + TOBN(0xb58300a6, 0x5265b54c), 
TOBN(0x3286662f, 0x722c7148)}}, + {{TOBN(0xb77fd76b, 0xb4ec4c20), TOBN(0xf0a12fa7, 0x0f3fe3fd), + TOBN(0xf845bbf5, 0x41d8c7e8), TOBN(0xe4d969ca, 0x5ec10aa8)}, + {TOBN(0x4c0053b7, 0x43e232a3), TOBN(0xdc7a3fac, 0x37f8a45a), + TOBN(0x3c4261c5, 0x20d81c8f), TOBN(0xfd4b3453, 0xb00eab00)}}, + {{TOBN(0x76d48f86, 0xd36e3062), TOBN(0x626c5277, 0xa143ff02), + TOBN(0x538174de, 0xaf76f42e), TOBN(0x2267aa86, 0x6407ceac)}, + {TOBN(0xfad76351, 0x72e572d5), TOBN(0xab861af7, 0xba7330eb), + TOBN(0xa0a1c8c7, 0x418d8657), TOBN(0x988821cb, 0x20289a52)}}, + {{TOBN(0x79732522, 0xcccc18ad), TOBN(0xaadf3f8d, 0xf1a6e027), + TOBN(0xf7382c93, 0x17c2354d), TOBN(0x5ce1680c, 0xd818b689)}, + {TOBN(0x359ebbfc, 0xd9ecbee9), TOBN(0x4330689c, 0x1cae62ac), + TOBN(0xb55ce5b4, 0xc51ac38a), TOBN(0x7921dfea, 0xfe238ee8)}}, + {{TOBN(0x3972bef8, 0x271d1ca5), TOBN(0x3e423bc7, 0xe8aabd18), + TOBN(0x57b09f3f, 0x44a3e5e3), TOBN(0x5da886ae, 0x7b444d66)}, + {TOBN(0x68206634, 0xa9964375), TOBN(0x356a2fa3, 0x699cd0ff), + TOBN(0xaf0faa24, 0xdba515e9), TOBN(0x536e1f5c, 0xb321d79a)}}, + {{TOBN(0xd3b9913a, 0x5c04e4ea), TOBN(0xd549dcfe, 0xd6f11513), + TOBN(0xee227bf5, 0x79fd1d94), TOBN(0x9f35afee, 0xb43f2c67)}, + {TOBN(0xd2638d24, 0xf1314f53), TOBN(0x62baf948, 0xcabcd822), + TOBN(0x5542de29, 0x4ef48db0), TOBN(0xb3eb6a04, 0xfc5f6bb2)}}, + {{TOBN(0x23c110ae, 0x1208e16a), TOBN(0x1a4d15b5, 0xf8363e24), + TOBN(0x30716844, 0x164be00b), TOBN(0xa8e24824, 0xf6f4690d)}, + {TOBN(0x548773a2, 0x90b170cf), TOBN(0xa1bef331, 0x42f191f4), + TOBN(0x70f418d0, 0x9247aa97), TOBN(0xea06028e, 0x48be9147)}}, + {{TOBN(0xe13122f3, 0xdbfb894e), TOBN(0xbe9b79f6, 0xce274b18), + TOBN(0x85a49de5, 0xca58aadf), TOBN(0x24957758, 0x11487351)}, + {TOBN(0x111def61, 0xbb939099), TOBN(0x1d6a974a, 0x26d13694), + TOBN(0x4474b4ce, 0xd3fc253b), TOBN(0x3a1485e6, 0x4c5db15e)}}, + {{TOBN(0xe79667b4, 0x147c15b4), TOBN(0xe34f553b, 0x7bc61301), + TOBN(0x032b80f8, 0x17094381), TOBN(0x55d8bafd, 0x723eaa21)}, + {TOBN(0x5a987995, 0xf1c0e74e), TOBN(0x5a9b292e, 0xebba289c), + TOBN(0x413cd4b2, 0xeb4c8251), TOBN(0x98b5d243, 0xd162db0a)}}, + {{TOBN(0xbb47bf66, 0x68342520), TOBN(0x08d68949, 0xbaa862d1), + TOBN(0x11f349c7, 0xe906abcd), TOBN(0x454ce985, 0xed7bf00e)}, + {TOBN(0xacab5c9e, 0xb55b803b), TOBN(0xb03468ea, 0x31e3c16d), + TOBN(0x5c24213d, 0xd273bf12), TOBN(0x211538eb, 0x71587887)}}, + {{TOBN(0x198e4a2f, 0x731dea2d), TOBN(0xd5856cf2, 0x74ed7b2a), + TOBN(0x86a632eb, 0x13a664fe), TOBN(0x932cd909, 0xbda41291)}, + {TOBN(0x850e95d4, 0xc0c4ddc0), TOBN(0xc0f422f8, 0x347fc2c9), + TOBN(0xe68cbec4, 0x86076bcb), TOBN(0xf9e7c0c0, 0xcd6cd286)}}, + {{TOBN(0x65994ddb, 0x0f5f27ca), TOBN(0xe85461fb, 0xa80d59ff), + TOBN(0xff05481a, 0x66601023), TOBN(0xc665427a, 0xfc9ebbfb)}, + {TOBN(0xb0571a69, 0x7587fd52), TOBN(0x935289f8, 0x8d49efce), + TOBN(0x61becc60, 0xea420688), TOBN(0xb22639d9, 0x13a786af)}}, + {{TOBN(0x1a8e6220, 0x361ecf90), TOBN(0x001f23e0, 0x25506463), + TOBN(0xe4ae9b5d, 0x0a5c2b79), TOBN(0xebc9cdad, 0xd8149db5)}, + {TOBN(0xb33164a1, 0x934aa728), TOBN(0x750eb00e, 0xae9b60f3), + TOBN(0x5a91615b, 0x9b9cfbfd), TOBN(0x97015cbf, 0xef45f7f6)}}, + {{TOBN(0xb462c4a5, 0xbf5151df), TOBN(0x21adcc41, 0xb07118f2), + TOBN(0xd60c545b, 0x043fa42c), TOBN(0xfc21aa54, 0xe96be1ab)}, + {TOBN(0xe84bc32f, 0x4e51ea80), TOBN(0x3dae45f0, 0x259b5d8d), + TOBN(0xbb73c7eb, 0xc38f1b5e), TOBN(0xe405a74a, 0xe8ae617d)}}, + {{TOBN(0xbb1ae9c6, 0x9f1c56bd), TOBN(0x8c176b98, 0x49f196a4), + TOBN(0xc448f311, 0x6875092b), TOBN(0xb5afe3de, 0x9f976033)}, + {TOBN(0xa8dafd49, 0x145813e5), TOBN(0x687fc4d9, 0xe2b34226), + TOBN(0xf2dfc92d, 
0x4c7ff57f), TOBN(0x004e3fc1, 0x401f1b46)}}, + {{TOBN(0x5afddab6, 0x1430c9ab), TOBN(0x0bdd41d3, 0x2238e997), + TOBN(0xf0947430, 0x418042ae), TOBN(0x71f9adda, 0xcdddc4cb)}, + {TOBN(0x7090c016, 0xc52dd907), TOBN(0xd9bdf44d, 0x29e2047f), + TOBN(0xe6f1fe80, 0x1b1011a6), TOBN(0xb63accbc, 0xd9acdc78)}}, + {{TOBN(0xcfc7e235, 0x1272a95b), TOBN(0x0c667717, 0xa6276ac8), + TOBN(0x3c0d3709, 0xe2d7eef7), TOBN(0x5add2b06, 0x9a685b3e)}, + {TOBN(0x363ad32d, 0x14ea5d65), TOBN(0xf8e01f06, 0x8d7dd506), + TOBN(0xc9ea2213, 0x75b4aac6), TOBN(0xed2a2bf9, 0x0d353466)}}, + {{TOBN(0x439d79b5, 0xe9d3a7c3), TOBN(0x8e0ee5a6, 0x81b7f34b), + TOBN(0xcf3dacf5, 0x1dc4ba75), TOBN(0x1d3d1773, 0xeb3310c7)}, + {TOBN(0xa8e67112, 0x7747ae83), TOBN(0x31f43160, 0x197d6b40), + TOBN(0x0521ccee, 0xcd961400), TOBN(0x67246f11, 0xf6535768)}}, + {{TOBN(0x702fcc5a, 0xef0c3133), TOBN(0x247cc45d, 0x7e16693b), + TOBN(0xfd484e49, 0xc729b749), TOBN(0x522cef7d, 0xb218320f)}, + {TOBN(0xe56ef405, 0x59ab93b3), TOBN(0x225fba11, 0x9f181071), + TOBN(0x33bd6595, 0x15330ed0), TOBN(0xc4be69d5, 0x1ddb32f7)}}, + {{TOBN(0x264c7668, 0x0448087c), TOBN(0xac30903f, 0x71432dae), + TOBN(0x3851b266, 0x00f9bf47), TOBN(0x400ed311, 0x6cdd6d03)}, + {TOBN(0x045e79fe, 0xf8fd2424), TOBN(0xfdfd974a, 0xfa6da98b), + TOBN(0x45c9f641, 0x0c1e673a), TOBN(0x76f2e733, 0x5b2c5168)}}, + {{TOBN(0x1adaebb5, 0x2a601753), TOBN(0xb286514c, 0xc57c2d49), + TOBN(0xd8769670, 0x1e0bfd24), TOBN(0x950c547e, 0x04478922)}, + {TOBN(0xd1d41969, 0xe5d32bfe), TOBN(0x30bc1472, 0x750d6c3e), + TOBN(0x8f3679fe, 0xe0e27f3a), TOBN(0x8f64a7dc, 0xa4a6ee0c)}}, + {{TOBN(0x2fe59937, 0x633dfb1f), TOBN(0xea82c395, 0x977f2547), + TOBN(0xcbdfdf1a, 0x661ea646), TOBN(0xc7ccc591, 0xb9085451)}, + {TOBN(0x82177962, 0x81761e13), TOBN(0xda57596f, 0x9196885c), + TOBN(0xbc17e849, 0x28ffbd70), TOBN(0x1e6e0a41, 0x2671d36f)}}, + {{TOBN(0x61ae872c, 0x4152fcf5), TOBN(0x441c87b0, 0x9e77e754), + TOBN(0xd0799dd5, 0xa34dff09), TOBN(0x766b4e44, 0x88a6b171)}, + {TOBN(0xdc06a512, 0x11f1c792), TOBN(0xea02ae93, 0x4be35c3e), + TOBN(0xe5ca4d6d, 0xe90c469e), TOBN(0x4df4368e, 0x56e4ff5c)}}, + {{TOBN(0x7817acab, 0x4baef62e), TOBN(0x9f5a2202, 0xa85b91e8), + TOBN(0x9666ebe6, 0x6ce57610), TOBN(0x32ad31f3, 0xf73bfe03)}, + {TOBN(0x628330a4, 0x25bcf4d6), TOBN(0xea950593, 0x515056e6), + TOBN(0x59811c89, 0xe1332156), TOBN(0xc89cf1fe, 0x8c11b2d7)}}, + {{TOBN(0x75b63913, 0x04e60cc0), TOBN(0xce811e8d, 0x4625d375), + TOBN(0x030e43fc, 0x2d26e562), TOBN(0xfbb30b4b, 0x608d36a0)}, + {TOBN(0x634ff82c, 0x48528118), TOBN(0x7c6fe085, 0xcd285911), + TOBN(0x7f2830c0, 0x99358f28), TOBN(0x2e60a95e, 0x665e6c09)}}, + {{TOBN(0x08407d3d, 0x9b785dbf), TOBN(0x530889ab, 0xa759bce7), + TOBN(0xf228e0e6, 0x52f61239), TOBN(0x2b6d1461, 0x6879be3c)}, + {TOBN(0xe6902c04, 0x51a7bbf7), TOBN(0x30ad99f0, 0x76f24a64), + TOBN(0x66d9317a, 0x98bc6da0), TOBN(0xf4f877f3, 0xcb596ac0)}}, + {{TOBN(0xb05ff62d, 0x4c44f119), TOBN(0x4555f536, 0xe9b77416), + TOBN(0xc7c0d059, 0x8caed63b), TOBN(0x0cd2b7ce, 0xc358b2a9)}, + {TOBN(0x3f33287b, 0x46945fa3), TOBN(0xf8785b20, 0xd67c8791), + TOBN(0xc54a7a61, 0x9637bd08), TOBN(0x54d4598c, 0x18be79d7)}}, + {{TOBN(0x889e5acb, 0xc46d7ce1), TOBN(0x9a515bb7, 0x8b085877), + TOBN(0xfac1a03d, 0x0b7a5050), TOBN(0x7d3e738a, 0xf2926035)}, + {TOBN(0x861cc2ce, 0x2a6cb0eb), TOBN(0x6f2e2955, 0x8f7adc79), + TOBN(0x61c4d451, 0x33016376), TOBN(0xd9fd2c80, 0x5ad59090)}}, + {{TOBN(0xe5a83738, 0xb2b836a1), TOBN(0x855b41a0, 0x7c0d6622), + TOBN(0x186fe317, 0x7cc19af1), TOBN(0x6465c1ff, 0xfdd99acb)}, + {TOBN(0x46e5c23f, 0x6974b99e), TOBN(0x75a7cf8b, 0xa2717cbe), + 
TOBN(0x4d2ebc3f, 0x062be658), TOBN(0x094b4447, 0x5f209c98)}}, + {{TOBN(0x4af285ed, 0xb940cb5a), TOBN(0x6706d792, 0x7cc82f10), + TOBN(0xc8c8776c, 0x030526fa), TOBN(0xfa8e6f76, 0xa0da9140)}, + {TOBN(0x77ea9d34, 0x591ee4f0), TOBN(0x5f46e337, 0x40274166), + TOBN(0x1bdf98bb, 0xea671457), TOBN(0xd7c08b46, 0x862a1fe2)}}, + {{TOBN(0x46cc303c, 0x1c08ad63), TOBN(0x99543440, 0x4c845e7b), + TOBN(0x1b8fbdb5, 0x48f36bf7), TOBN(0x5b82c392, 0x8c8273a7)}, + {TOBN(0x08f712c4, 0x928435d5), TOBN(0x071cf0f1, 0x79330380), + TOBN(0xc74c2d24, 0xa8da054a), TOBN(0xcb0e7201, 0x43c46b5c)}}, + {{TOBN(0x0ad7337a, 0xc0b7eff3), TOBN(0x8552225e, 0xc5e48b3c), + TOBN(0xe6f78b0c, 0x73f13a5f), TOBN(0x5e70062e, 0x82349cbe)}, + {TOBN(0x6b8d5048, 0xe7073969), TOBN(0x392d2a29, 0xc33cb3d2), + TOBN(0xee4f727c, 0x4ecaa20f), TOBN(0xa068c99e, 0x2ccde707)}}, + {{TOBN(0xfcd5651f, 0xb87a2913), TOBN(0xea3e3c15, 0x3cc252f0), + TOBN(0x777d92df, 0x3b6cd3e4), TOBN(0x7a414143, 0xc5a732e7)}, + {TOBN(0xa895951a, 0xa71ff493), TOBN(0xfe980c92, 0xbbd37cf6), + TOBN(0x45bd5e64, 0xdecfeeff), TOBN(0x910dc2a9, 0xa44c43e9)}}, + {{TOBN(0xcb403f26, 0xcca9f54d), TOBN(0x928bbdfb, 0x9303f6db), + TOBN(0x3c37951e, 0xa9eee67c), TOBN(0x3bd61a52, 0xf79961c3)}, + {TOBN(0x09a238e6, 0x395c9a79), TOBN(0x6940ca2d, 0x61eb352d), + TOBN(0x7d1e5c5e, 0xc1875631), TOBN(0x1e19742c, 0x1e1b20d1)}}, + {{TOBN(0x4633d908, 0x23fc2e6e), TOBN(0xa76e29a9, 0x08959149), + TOBN(0x61069d9c, 0x84ed7da5), TOBN(0x0baa11cf, 0x5dbcad51)}, + {TOBN(0xd01eec64, 0x961849da), TOBN(0x93b75f1f, 0xaf3d8c28), + TOBN(0x57bc4f9f, 0x1ca2ee44), TOBN(0x5a26322d, 0x00e00558)}}, + {{TOBN(0x1888d658, 0x61a023ef), TOBN(0x1d72aab4, 0xb9e5246e), + TOBN(0xa9a26348, 0xe5563ec0), TOBN(0xa0971963, 0xc3439a43)}, + {TOBN(0x567dd54b, 0xadb9b5b7), TOBN(0x73fac1a1, 0xc45a524b), + TOBN(0x8fe97ef7, 0xfe38e608), TOBN(0x608748d2, 0x3f384f48)}}, + {{TOBN(0xb0571794, 0xc486094f), TOBN(0x869254a3, 0x8bf3a8d6), + TOBN(0x148a8dd1, 0x310b0e25), TOBN(0x99ab9f3f, 0x9aa3f7d8)}, + {TOBN(0x0927c68a, 0x6706c02e), TOBN(0x22b5e76c, 0x69790e6c), + TOBN(0x6c325260, 0x6c71376c), TOBN(0x53a57690, 0x09ef6657)}}, + {{TOBN(0x8d63f852, 0xedffcf3a), TOBN(0xb4d2ed04, 0x3c0a6f55), + TOBN(0xdb3aa8de, 0x12519b9e), TOBN(0x5d38e9c4, 0x1e0a569a)}, + {TOBN(0x871528bf, 0x303747e2), TOBN(0xa208e77c, 0xf5b5c18d), + TOBN(0x9d129c88, 0xca6bf923), TOBN(0xbcbf197f, 0xbf02839f)}}, + {{TOBN(0x9b9bf030, 0x27323194), TOBN(0x3b055a8b, 0x339ca59d), + TOBN(0xb46b2312, 0x0f669520), TOBN(0x19789f1f, 0x497e5f24)}, + {TOBN(0x9c499468, 0xaaf01801), TOBN(0x72ee1190, 0x8b69d59c), + TOBN(0x8bd39595, 0xacf4c079), TOBN(0x3ee11ece, 0x8e0cd048)}}, + {{TOBN(0xebde86ec, 0x1ed66f18), TOBN(0x225d906b, 0xd61fce43), + TOBN(0x5cab07d6, 0xe8bed74d), TOBN(0x16e4617f, 0x27855ab7)}, + {TOBN(0x6568aadd, 0xb2fbc3dd), TOBN(0xedb5484f, 0x8aeddf5b), + TOBN(0x878f20e8, 0x6dcf2fad), TOBN(0x3516497c, 0x615f5699)}}}, + {{{TOBN(0xef0a3fec, 0xfa181e69), TOBN(0x9ea02f81, 0x30d69a98), + TOBN(0xb2e9cf8e, 0x66eab95d), TOBN(0x520f2beb, 0x24720021)}, + {TOBN(0x621c540a, 0x1df84361), TOBN(0x12037721, 0x71fa6d5d), + TOBN(0x6e3c7b51, 0x0ff5f6ff), TOBN(0x817a069b, 0xabb2bef3)}}, + {{TOBN(0x83572fb6, 0xb294cda6), TOBN(0x6ce9bf75, 0xb9039f34), + TOBN(0x20e012f0, 0x095cbb21), TOBN(0xa0aecc1b, 0xd063f0da)}, + {TOBN(0x57c21c3a, 0xf02909e5), TOBN(0xc7d59ecf, 0x48ce9cdc), + TOBN(0x2732b844, 0x8ae336f8), TOBN(0x056e3723, 0x3f4f85f4)}}, + {{TOBN(0x8a10b531, 0x89e800ca), TOBN(0x50fe0c17, 0x145208fd), + TOBN(0x9e43c0d3, 0xb714ba37), TOBN(0x427d200e, 0x34189acc)}, + {TOBN(0x05dee24f, 0xe616e2c0), TOBN(0x9c25f4c8, 
0xee1854c1), + TOBN(0x4d3222a5, 0x8f342a73), TOBN(0x0807804f, 0xa027c952)}}, + {{TOBN(0xc222653a, 0x4f0d56f3), TOBN(0x961e4047, 0xca28b805), + TOBN(0x2c03f8b0, 0x4a73434b), TOBN(0x4c966787, 0xab712a19)}, + {TOBN(0xcc196c42, 0x864fee42), TOBN(0xc1be93da, 0x5b0ece5c), + TOBN(0xa87d9f22, 0xc131c159), TOBN(0x2bb6d593, 0xdce45655)}}, + {{TOBN(0x22c49ec9, 0xb809b7ce), TOBN(0x8a41486b, 0xe2c72c2c), + TOBN(0x813b9420, 0xfea0bf36), TOBN(0xb3d36ee9, 0xa66dac69)}, + {TOBN(0x6fddc08a, 0x328cc987), TOBN(0x0a3bcd2c, 0x3a326461), + TOBN(0x7103c49d, 0xd810dbba), TOBN(0xf9d81a28, 0x4b78a4c4)}}, + {{TOBN(0x3de865ad, 0xe4d55941), TOBN(0xdedafa5e, 0x30384087), + TOBN(0x6f414abb, 0x4ef18b9b), TOBN(0x9ee9ea42, 0xfaee5268)}, + {TOBN(0x260faa16, 0x37a55a4a), TOBN(0xeb19a514, 0x015f93b9), + TOBN(0x51d7ebd2, 0x9e9c3598), TOBN(0x523fc56d, 0x1932178e)}}, + {{TOBN(0x501d070c, 0xb98fe684), TOBN(0xd60fbe9a, 0x124a1458), + TOBN(0xa45761c8, 0x92bc6b3f), TOBN(0xf5384858, 0xfe6f27cb)}, + {TOBN(0x4b0271f7, 0xb59e763b), TOBN(0x3d4606a9, 0x5b5a8e5e), + TOBN(0x1eda5d9b, 0x05a48292), TOBN(0xda7731d0, 0xe6fec446)}}, + {{TOBN(0xa3e33693, 0x90d45871), TOBN(0xe9764040, 0x06166d8d), + TOBN(0xb5c33682, 0x89a90403), TOBN(0x4bd17983, 0x72f1d637)}, + {TOBN(0xa616679e, 0xd5d2c53a), TOBN(0x5ec4bcd8, 0xfdcf3b87), + TOBN(0xae6d7613, 0xb66a694e), TOBN(0x7460fc76, 0xe3fc27e5)}}, + {{TOBN(0x70469b82, 0x95caabee), TOBN(0xde024ca5, 0x889501e3), + TOBN(0x6bdadc06, 0x076ed265), TOBN(0x0cb1236b, 0x5a0ef8b2)}, + {TOBN(0x4065ddbf, 0x0972ebf9), TOBN(0xf1dd3875, 0x22aca432), + TOBN(0xa88b97cf, 0x744aff76), TOBN(0xd1359afd, 0xfe8e3d24)}}, + {{TOBN(0x52a3ba2b, 0x91502cf3), TOBN(0x2c3832a8, 0x084db75d), + TOBN(0x04a12ddd, 0xde30b1c9), TOBN(0x7802eabc, 0xe31fd60c)}, + {TOBN(0x33707327, 0xa37fddab), TOBN(0x65d6f2ab, 0xfaafa973), + TOBN(0x3525c5b8, 0x11e6f91a), TOBN(0x76aeb0c9, 0x5f46530b)}}, + {{TOBN(0xe8815ff6, 0x2f93a675), TOBN(0xa6ec9684, 0x05f48679), + TOBN(0x6dcbb556, 0x358ae884), TOBN(0x0af61472, 0xe19e3873)}, + {TOBN(0x72334372, 0xa5f696be), TOBN(0xc65e57ea, 0x6f22fb70), + TOBN(0x268da30c, 0x946cea90), TOBN(0x136a8a87, 0x65681b2a)}}, + {{TOBN(0xad5e81dc, 0x0f9f44d4), TOBN(0xf09a6960, 0x2c46585a), + TOBN(0xd1649164, 0xc447d1b1), TOBN(0x3b4b36c8, 0x879dc8b1)}, + {TOBN(0x20d4177b, 0x3b6b234c), TOBN(0x096a2505, 0x1730d9d0), + TOBN(0x0611b9b8, 0xef80531d), TOBN(0xba904b3b, 0x64bb495d)}}, + {{TOBN(0x1192d9d4, 0x93a3147a), TOBN(0x9f30a5dc, 0x9a565545), + TOBN(0x90b1f9cb, 0x6ef07212), TOBN(0x29958546, 0x0d87fc13)}, + {TOBN(0xd3323eff, 0xc17db9ba), TOBN(0xcb18548c, 0xcb1644a8), + TOBN(0x18a306d4, 0x4f49ffbc), TOBN(0x28d658f1, 0x4c2e8684)}}, + {{TOBN(0x44ba60cd, 0xa99f8c71), TOBN(0x67b7abdb, 0x4bf742ff), + TOBN(0x66310f9c, 0x914b3f99), TOBN(0xae430a32, 0xf412c161)}, + {TOBN(0x1e6776d3, 0x88ace52f), TOBN(0x4bc0fa24, 0x52d7067d), + TOBN(0x03c286aa, 0x8f07cd1b), TOBN(0x4cb8f38c, 0xa985b2c1)}}, + {{TOBN(0x83ccbe80, 0x8c3bff36), TOBN(0x005a0bd2, 0x5263e575), + TOBN(0x460d7dda, 0x259bdcd1), TOBN(0x4a1c5642, 0xfa5cab6b)}, + {TOBN(0x2b7bdbb9, 0x9fe4fc88), TOBN(0x09418e28, 0xcc97bbb5), + TOBN(0xd8274fb4, 0xa12321ae), TOBN(0xb137007d, 0x5c87b64e)}}, + {{TOBN(0x80531fe1, 0xc63c4962), TOBN(0x50541e89, 0x981fdb25), + TOBN(0xdc1291a1, 0xfd4c2b6b), TOBN(0xc0693a17, 0xa6df4fca)}, + {TOBN(0xb2c4604e, 0x0117f203), TOBN(0x245f1963, 0x0a99b8d0), + TOBN(0xaedc20aa, 0xc6212c44), TOBN(0xb1ed4e56, 0x520f52a8)}}, + {{TOBN(0xfe48f575, 0xf8547be3), TOBN(0x0a7033cd, 0xa9e45f98), + TOBN(0x4b45d3a9, 0x18c50100), TOBN(0xb2a6cd6a, 0xa61d41da)}, + {TOBN(0x60bbb4f5, 0x57933c6b), 
TOBN(0xa7538ebd, 0x2b0d7ffc), + TOBN(0x9ea3ab8d, 0x8cd626b6), TOBN(0x8273a484, 0x3601625a)}}, + {{TOBN(0x88859845, 0x0168e508), TOBN(0x8cbc9bb2, 0x99a94abd), + TOBN(0x713ac792, 0xfab0a671), TOBN(0xa3995b19, 0x6c9ebffc)}, + {TOBN(0xe711668e, 0x1239e152), TOBN(0x56892558, 0xbbb8dff4), + TOBN(0x8bfc7dab, 0xdbf17963), TOBN(0x5b59fe5a, 0xb3de1253)}}, + {{TOBN(0x7e3320eb, 0x34a9f7ae), TOBN(0xe5e8cf72, 0xd751efe4), + TOBN(0x7ea003bc, 0xd9be2f37), TOBN(0xc0f551a0, 0xb6c08ef7)}, + {TOBN(0x56606268, 0x038f6725), TOBN(0x1dd38e35, 0x6d92d3b6), + TOBN(0x07dfce7c, 0xc3cbd686), TOBN(0x4e549e04, 0x651c5da8)}}, + {{TOBN(0x4058f93b, 0x08b19340), TOBN(0xc2fae6f4, 0xcac6d89d), + TOBN(0x4bad8a8c, 0x8f159cc7), TOBN(0x0ddba4b3, 0xcb0b601c)}, + {TOBN(0xda4fc7b5, 0x1dd95f8c), TOBN(0x1d163cd7, 0xcea5c255), + TOBN(0x30707d06, 0x274a8c4c), TOBN(0x79d9e008, 0x2802e9ce)}}, + {{TOBN(0x02a29ebf, 0xe6ddd505), TOBN(0x37064e74, 0xb50bed1a), + TOBN(0x3f6bae65, 0xa7327d57), TOBN(0x3846f5f1, 0xf83920bc)}, + {TOBN(0x87c37491, 0x60df1b9b), TOBN(0x4cfb2895, 0x2d1da29f), + TOBN(0x10a478ca, 0x4ed1743c), TOBN(0x390c6030, 0x3edd47c6)}}, + {{TOBN(0x8f3e5312, 0x8c0a78de), TOBN(0xccd02bda, 0x1e85df70), + TOBN(0xd6c75c03, 0xa61b6582), TOBN(0x0762921c, 0xfc0eebd1)}, + {TOBN(0xd34d0823, 0xd85010c0), TOBN(0xd73aaacb, 0x0044cf1f), + TOBN(0xfb4159bb, 0xa3b5e78a), TOBN(0x2287c7f7, 0xe5826f3f)}}, + {{TOBN(0x4aeaf742, 0x580b1a01), TOBN(0xf080415d, 0x60423b79), + TOBN(0xe12622cd, 0xa7dea144), TOBN(0x49ea4996, 0x59d62472)}, + {TOBN(0xb42991ef, 0x571f3913), TOBN(0x0610f214, 0xf5b25a8a), + TOBN(0x47adc585, 0x30b79e8f), TOBN(0xf90e3df6, 0x07a065a2)}}, + {{TOBN(0x5d0a5deb, 0x43e2e034), TOBN(0x53fb5a34, 0x444024aa), + TOBN(0xa8628c68, 0x6b0c9f7f), TOBN(0x9c69c29c, 0xac563656)}, + {TOBN(0x5a231feb, 0xbace47b6), TOBN(0xbdce0289, 0x9ea5a2ec), + TOBN(0x05da1fac, 0x9463853e), TOBN(0x96812c52, 0x509e78aa)}}, + {{TOBN(0xd3fb5771, 0x57151692), TOBN(0xeb2721f8, 0xd98e1c44), + TOBN(0xc0506087, 0x32399be1), TOBN(0xda5a5511, 0xd979d8b8)}, + {TOBN(0x737ed55d, 0xc6f56780), TOBN(0xe20d3004, 0x0dc7a7f4), + TOBN(0x02ce7301, 0xf5941a03), TOBN(0x91ef5215, 0xed30f83a)}}, + {{TOBN(0x28727fc1, 0x4092d85f), TOBN(0x72d223c6, 0x5c49e41a), + TOBN(0xa7cf30a2, 0xba6a4d81), TOBN(0x7c086209, 0xb030d87d)}, + {TOBN(0x04844c7d, 0xfc588b09), TOBN(0x728cd499, 0x5874bbb0), + TOBN(0xcc1281ee, 0xe84c0495), TOBN(0x0769b5ba, 0xec31958f)}}, + {{TOBN(0x665c228b, 0xf99c2471), TOBN(0xf2d8a11b, 0x191eb110), + TOBN(0x4594f494, 0xd36d7024), TOBN(0x482ded8b, 0xcdcb25a1)}, + {TOBN(0xc958a9d8, 0xdadd4885), TOBN(0x7004477e, 0xf1d2b547), + TOBN(0x0a45f6ef, 0x2a0af550), TOBN(0x4fc739d6, 0x2f8d6351)}}, + {{TOBN(0x75cdaf27, 0x786f08a9), TOBN(0x8700bb26, 0x42c2737f), + TOBN(0x855a7141, 0x1c4e2670), TOBN(0x810188c1, 0x15076fef)}, + {TOBN(0xc251d0c9, 0xabcd3297), TOBN(0xae4c8967, 0xf48108eb), + TOBN(0xbd146de7, 0x18ceed30), TOBN(0xf9d4f07a, 0xc986bced)}}, + {{TOBN(0x5ad98ed5, 0x83fa1e08), TOBN(0x7780d33e, 0xbeabd1fb), + TOBN(0xe330513c, 0x903b1196), TOBN(0xba11de9e, 0xa47bc8c4)}, + {TOBN(0x684334da, 0x02c2d064), TOBN(0x7ecf360d, 0xa48de23b), + TOBN(0x57a1b474, 0x0a9089d8), TOBN(0xf28fa439, 0xff36734c)}}, + {{TOBN(0xf2a482cb, 0xea4570b3), TOBN(0xee65d68b, 0xa5ebcee9), + TOBN(0x988d0036, 0xb9694cd5), TOBN(0x53edd0e9, 0x37885d32)}, + {TOBN(0xe37e3307, 0xbeb9bc6d), TOBN(0xe9abb907, 0x9f5c6768), + TOBN(0x4396ccd5, 0x51f2160f), TOBN(0x2500888c, 0x47336da6)}}, + {{TOBN(0x383f9ed9, 0x926fce43), TOBN(0x809dd1c7, 0x04da2930), + TOBN(0x30f6f596, 0x8a4cb227), TOBN(0x0d700c7f, 0x73a56b38)}, + {TOBN(0x1825ea33, 
0xab64a065), TOBN(0xaab9b735, 0x1338df80), + TOBN(0x1516100d, 0x9b63f57f), TOBN(0x2574395a, 0x27a6a634)}}, + {{TOBN(0xb5560fb6, 0x700a1acd), TOBN(0xe823fd73, 0xfd999681), + TOBN(0xda915d1f, 0x6cb4e1ba), TOBN(0x0d030118, 0x6ebe00a3)}, + {TOBN(0x744fb0c9, 0x89fca8cd), TOBN(0x970d01db, 0xf9da0e0b), + TOBN(0x0ad8c564, 0x7931d76f), TOBN(0xb15737bf, 0xf659b96a)}}, + {{TOBN(0xdc9933e8, 0xa8b484e7), TOBN(0xb2fdbdf9, 0x7a26dec7), + TOBN(0x2349e9a4, 0x9f1f0136), TOBN(0x7860368e, 0x70fddddb)}, + {TOBN(0xd93d2c1c, 0xf9ad3e18), TOBN(0x6d6c5f17, 0x689f4e79), + TOBN(0x7a544d91, 0xb24ff1b6), TOBN(0x3e12a5eb, 0xfe16cd8c)}}, + {{TOBN(0x543574e9, 0xa56b872f), TOBN(0xa1ad550c, 0xfcf68ea2), + TOBN(0x689e37d2, 0x3f560ef7), TOBN(0x8c54b9ca, 0xc9d47a8b)}, + {TOBN(0x46d40a4a, 0x088ac342), TOBN(0xec450c7c, 0x1576c6d0), + TOBN(0xb589e31c, 0x1f9689e9), TOBN(0xdacf2602, 0xb8781718)}}, + {{TOBN(0xa89237c6, 0xc8cb6b42), TOBN(0x1326fc93, 0xb96ef381), + TOBN(0x55d56c6d, 0xb5f07825), TOBN(0xacba2eea, 0x7449e22d)}, + {TOBN(0x74e0887a, 0x633c3000), TOBN(0xcb6cd172, 0xd7cbcf71), + TOBN(0x309e81de, 0xc36cf1be), TOBN(0x07a18a6d, 0x60ae399b)}}, + {{TOBN(0xb36c2679, 0x9edce57e), TOBN(0x52b892f4, 0xdf001d41), + TOBN(0xd884ae5d, 0x16a1f2c6), TOBN(0x9b329424, 0xefcc370a)}, + {TOBN(0x3120daf2, 0xbd2e21df), TOBN(0x55298d2d, 0x02470a99), + TOBN(0x0b78af6c, 0xa05db32e), TOBN(0x5c76a331, 0x601f5636)}}, + {{TOBN(0xaae861ff, 0xf8a4f29c), TOBN(0x70dc9240, 0xd68f8d49), + TOBN(0x960e649f, 0x81b1321c), TOBN(0x3d2c801b, 0x8792e4ce)}, + {TOBN(0xf479f772, 0x42521876), TOBN(0x0bed93bc, 0x416c79b1), + TOBN(0xa67fbc05, 0x263e5bc9), TOBN(0x01e8e630, 0x521db049)}}, + {{TOBN(0x76f26738, 0xc6f3431e), TOBN(0xe609cb02, 0xe3267541), + TOBN(0xb10cff2d, 0x818c877c), TOBN(0x1f0e75ce, 0x786a13cb)}, + {TOBN(0xf4fdca64, 0x1158544d), TOBN(0x5d777e89, 0x6cb71ed0), + TOBN(0x3c233737, 0xa9aa4755), TOBN(0x7b453192, 0xe527ab40)}}, + {{TOBN(0xdb59f688, 0x39f05ffe), TOBN(0x8f4f4be0, 0x6d82574e), + TOBN(0xcce3450c, 0xee292d1b), TOBN(0xaa448a12, 0x61ccd086)}, + {TOBN(0xabce91b3, 0xf7914967), TOBN(0x4537f09b, 0x1908a5ed), + TOBN(0xa812421e, 0xf51042e7), TOBN(0xfaf5cebc, 0xec0b3a34)}}, + {{TOBN(0x730ffd87, 0x4ca6b39a), TOBN(0x70fb72ed, 0x02efd342), + TOBN(0xeb4735f9, 0xd75c8edb), TOBN(0xc11f2157, 0xc278aa51)}, + {TOBN(0xc459f635, 0xbf3bfebf), TOBN(0x3a1ff0b4, 0x6bd9601f), + TOBN(0xc9d12823, 0xc420cb73), TOBN(0x3e9af3e2, 0x3c2915a3)}}, + {{TOBN(0xe0c82c72, 0xb41c3440), TOBN(0x175239e5, 0xe3039a5f), + TOBN(0xe1084b8a, 0x558795a3), TOBN(0x328d0a1d, 0xd01e5c60)}, + {TOBN(0x0a495f2e, 0xd3788a04), TOBN(0x25d8ff16, 0x66c11a9f), + TOBN(0xf5155f05, 0x9ed692d6), TOBN(0x954fa107, 0x4f425fe4)}}, + {{TOBN(0xd16aabf2, 0xe98aaa99), TOBN(0x90cd8ba0, 0x96b0f88a), + TOBN(0x957f4782, 0xc154026a), TOBN(0x54ee0734, 0x52af56d2)}, + {TOBN(0xbcf89e54, 0x45b4147a), TOBN(0x3d102f21, 0x9a52816c), + TOBN(0x6808517e, 0x39b62e77), TOBN(0x92e25421, 0x69169ad8)}}, + {{TOBN(0xd721d871, 0xbb608558), TOBN(0x60e4ebae, 0xf6d4ff9b), + TOBN(0x0ba10819, 0x41f2763e), TOBN(0xca2e45be, 0x51ee3247)}, + {TOBN(0x66d172ec, 0x2bfd7a5f), TOBN(0x528a8f2f, 0x74d0b12d), + TOBN(0xe17f1e38, 0xdabe70dc), TOBN(0x1d5d7316, 0x9f93983c)}}, + {{TOBN(0x51b2184a, 0xdf423e31), TOBN(0xcb417291, 0xaedb1a10), + TOBN(0x2054ca93, 0x625bcab9), TOBN(0x54396860, 0xa98998f0)}, + {TOBN(0x4e53f6c4, 0xa54ae57e), TOBN(0x0ffeb590, 0xee648e9d), + TOBN(0xfbbdaadc, 0x6afaf6bc), TOBN(0xf88ae796, 0xaa3bfb8a)}}, + {{TOBN(0x209f1d44, 0xd2359ed9), TOBN(0xac68dd03, 0xf3544ce2), + TOBN(0xf378da47, 0xfd51e569), TOBN(0xe1abd860, 0x2cc80097)}, + 
{TOBN(0x23ca18d9, 0x343b6e3a), TOBN(0x480797e8, 0xb40a1bae), + TOBN(0xd1f0c717, 0x533f3e67), TOBN(0x44896970, 0x06e6cdfc)}}, + {{TOBN(0x8ca21055, 0x52a82e8d), TOBN(0xb2caf785, 0x78460cdc), + TOBN(0x4c1b7b62, 0xe9037178), TOBN(0xefc09d2c, 0xdb514b58)}, + {TOBN(0x5f2df9ee, 0x9113be5c), TOBN(0x2fbda78f, 0xb3f9271c), + TOBN(0xe09a81af, 0x8f83fc54), TOBN(0x06b13866, 0x8afb5141)}}, + {{TOBN(0x38f6480f, 0x43e3865d), TOBN(0x72dd77a8, 0x1ddf47d9), + TOBN(0xf2a8e971, 0x4c205ff7), TOBN(0x46d449d8, 0x9d088ad8)}, + {TOBN(0x926619ea, 0x185d706f), TOBN(0xe47e02eb, 0xc7dd7f62), + TOBN(0xe7f120a7, 0x8cbc2031), TOBN(0xc18bef00, 0x998d4ac9)}}, + {{TOBN(0x18f37a9c, 0x6bdf22da), TOBN(0xefbc432f, 0x90dc82df), + TOBN(0xc52cef8e, 0x5d703651), TOBN(0x82887ba0, 0xd99881a5)}, + {TOBN(0x7cec9dda, 0xb920ec1d), TOBN(0xd0d7e8c3, 0xec3e8d3b), + TOBN(0x445bc395, 0x4ca88747), TOBN(0xedeaa2e0, 0x9fd53535)}}, + {{TOBN(0x461b1d93, 0x6cc87475), TOBN(0xd92a52e2, 0x6d2383bd), + TOBN(0xfabccb59, 0xd7903546), TOBN(0x6111a761, 0x3d14b112)}, + {TOBN(0x0ae584fe, 0xb3d5f612), TOBN(0x5ea69b8d, 0x60e828ec), + TOBN(0x6c078985, 0x54087030), TOBN(0x649cab04, 0xac4821fe)}}, + {{TOBN(0x25ecedcf, 0x8bdce214), TOBN(0xb5622f72, 0x86af7361), + TOBN(0x0e1227aa, 0x7038b9e2), TOBN(0xd0efb273, 0xac20fa77)}, + {TOBN(0x817ff88b, 0x79df975b), TOBN(0x856bf286, 0x1999503e), + TOBN(0xb4d5351f, 0x5038ec46), TOBN(0x740a52c5, 0xfc42af6e)}}, + {{TOBN(0x2e38bb15, 0x2cbb1a3f), TOBN(0xc3eb99fe, 0x17a83429), + TOBN(0xca4fcbf1, 0xdd66bb74), TOBN(0x880784d6, 0xcde5e8fc)}, + {TOBN(0xddc84c1c, 0xb4e7a0be), TOBN(0x8780510d, 0xbd15a72f), + TOBN(0x44bcf1af, 0x81ec30e1), TOBN(0x141e50a8, 0x0a61073e)}}, + {{TOBN(0x0d955718, 0x47be87ae), TOBN(0x68a61417, 0xf76a4372), + TOBN(0xf57e7e87, 0xc607c3d3), TOBN(0x043afaf8, 0x5252f332)}, + {TOBN(0xcc14e121, 0x1552a4d2), TOBN(0xb6dee692, 0xbb4d4ab4), + TOBN(0xb6ab74c8, 0xa03816a4), TOBN(0x84001ae4, 0x6f394a29)}}, + {{TOBN(0x5bed8344, 0xd795fb45), TOBN(0x57326e7d, 0xb79f55a5), + TOBN(0xc9533ce0, 0x4accdffc), TOBN(0x53473caf, 0x3993fa04)}, + {TOBN(0x7906eb93, 0xa13df4c8), TOBN(0xa73e51f6, 0x97cbe46f), + TOBN(0xd1ab3ae1, 0x0ae4ccf8), TOBN(0x25614508, 0x8a5b3dbc)}}, + {{TOBN(0x61eff962, 0x11a71b27), TOBN(0xdf71412b, 0x6bb7fa39), + TOBN(0xb31ba6b8, 0x2bd7f3ef), TOBN(0xb0b9c415, 0x69180d29)}, + {TOBN(0xeec14552, 0x014cdde5), TOBN(0x702c624b, 0x227b4bbb), + TOBN(0x2b15e8c2, 0xd3e988f3), TOBN(0xee3bcc6d, 0xa4f7fd04)}}, + {{TOBN(0x9d00822a, 0x42ac6c85), TOBN(0x2db0cea6, 0x1df9f2b7), + TOBN(0xd7cad2ab, 0x42de1e58), TOBN(0x346ed526, 0x2d6fbb61)}, + {TOBN(0xb3962995, 0x1a2faf09), TOBN(0x2fa8a580, 0x7c25612e), + TOBN(0x30ae04da, 0x7cf56490), TOBN(0x75662908, 0x0eea3961)}}, + {{TOBN(0x3609f5c5, 0x3d080847), TOBN(0xcb081d39, 0x5241d4f6), + TOBN(0xb4fb3810, 0x77961a63), TOBN(0xc20c5984, 0x2abb66fc)}, + {TOBN(0x3d40aa7c, 0xf902f245), TOBN(0x9cb12736, 0x4e536b1e), + TOBN(0x5eda24da, 0x99b3134f), TOBN(0xafbd9c69, 0x5cd011af)}}, + {{TOBN(0x9a16e30a, 0xc7088c7d), TOBN(0x5ab65710, 0x3207389f), + TOBN(0x1b09547f, 0xe7407a53), TOBN(0x2322f9d7, 0x4fdc6eab)}, + {TOBN(0xc0f2f22d, 0x7430de4d), TOBN(0x19382696, 0xe68ca9a9), + TOBN(0x17f1eff1, 0x918e5868), TOBN(0xe3b5b635, 0x586f4204)}}, + {{TOBN(0x146ef980, 0x3fbc4341), TOBN(0x359f2c80, 0x5b5eed4e), + TOBN(0x9f35744e, 0x7482e41d), TOBN(0x9a9ac3ec, 0xf3b224c2)}, + {TOBN(0x9161a6fe, 0x91fc50ae), TOBN(0x89ccc66b, 0xc613fa7c), + TOBN(0x89268b14, 0xc732f15a), TOBN(0x7cd6f4e2, 0xb467ed03)}}, + {{TOBN(0xfbf79869, 0xce56b40e), TOBN(0xf93e094c, 0xc02dde98), + TOBN(0xefe0c3a8, 0xedee2cd7), TOBN(0x90f3ffc0, 0xb268fd42)}, 
+ {TOBN(0x81a7fd56, 0x08241aed), TOBN(0x95ab7ad8, 0x00b1afe8), + TOBN(0x40127056, 0x3e310d52), TOBN(0xd3ffdeb1, 0x09d9fc43)}}, + {{TOBN(0xc8f85c91, 0xd11a8594), TOBN(0x2e74d258, 0x31cf6db8), + TOBN(0x829c7ca3, 0x02b5dfd0), TOBN(0xe389cfbe, 0x69143c86)}, + {TOBN(0xd01b6405, 0x941768d8), TOBN(0x45103995, 0x03bf825d), + TOBN(0xcc4ee166, 0x56cd17e2), TOBN(0xbea3c283, 0xba037e79)}}, + {{TOBN(0x4e1ac06e, 0xd9a47520), TOBN(0xfbfe18aa, 0xaf852404), + TOBN(0x5615f8e2, 0x8087648a), TOBN(0x7301e47e, 0xb9d150d9)}, + {TOBN(0x79f9f9dd, 0xb299b977), TOBN(0x76697a7b, 0xa5b78314), + TOBN(0x10d67468, 0x7d7c90e7), TOBN(0x7afffe03, 0x937210b5)}}, + {{TOBN(0x5aef3e4b, 0x28c22cee), TOBN(0xefb0ecd8, 0x09fd55ae), + TOBN(0x4cea7132, 0x0d2a5d6a), TOBN(0x9cfb5fa1, 0x01db6357)}, + {TOBN(0x395e0b57, 0xf36e1ac5), TOBN(0x008fa9ad, 0x36cafb7d), + TOBN(0x8f6cdf70, 0x5308c4db), TOBN(0x51527a37, 0x95ed2477)}}, + {{TOBN(0xba0dee30, 0x5bd21311), TOBN(0x6ed41b22, 0x909c90d7), + TOBN(0xc5f6b758, 0x7c8696d3), TOBN(0x0db8eaa8, 0x3ce83a80)}, + {TOBN(0xd297fe37, 0xb24b4b6f), TOBN(0xfe58afe8, 0x522d1f0d), + TOBN(0x97358736, 0x8c98dbd9), TOBN(0x6bc226ca, 0x9454a527)}}, + {{TOBN(0xa12b384e, 0xce53c2d0), TOBN(0x779d897d, 0x5e4606da), + TOBN(0xa53e47b0, 0x73ec12b0), TOBN(0x462dbbba, 0x5756f1ad)}, + {TOBN(0x69fe09f2, 0xcafe37b6), TOBN(0x273d1ebf, 0xecce2e17), + TOBN(0x8ac1d538, 0x3cf607fd), TOBN(0x8035f7ff, 0x12e10c25)}}}, + {{{TOBN(0x854d34c7, 0x7e6c5520), TOBN(0xc27df9ef, 0xdcb9ea58), + TOBN(0x405f2369, 0xd686666d), TOBN(0x29d1febf, 0x0417aa85)}, + {TOBN(0x9846819e, 0x93470afe), TOBN(0x3e6a9669, 0xe2a27f9e), + TOBN(0x24d008a2, 0xe31e6504), TOBN(0xdba7cecf, 0x9cb7680a)}}, + {{TOBN(0xecaff541, 0x338d6e43), TOBN(0x56f7dd73, 0x4541d5cc), + TOBN(0xb5d426de, 0x96bc88ca), TOBN(0x48d94f6b, 0x9ed3a2c3)}, + {TOBN(0x6354a3bb, 0x2ef8279c), TOBN(0xd575465b, 0x0b1867f2), + TOBN(0xef99b0ff, 0x95225151), TOBN(0xf3e19d88, 0xf94500d8)}}, + {{TOBN(0x92a83268, 0xe32dd620), TOBN(0x913ec99f, 0x627849a2), + TOBN(0xedd8fdfa, 0x2c378882), TOBN(0xaf96f33e, 0xee6f8cfe)}, + {TOBN(0xc06737e5, 0xdc3fa8a5), TOBN(0x236bb531, 0xb0b03a1d), + TOBN(0x33e59f29, 0x89f037b0), TOBN(0x13f9b5a7, 0xd9a12a53)}}, + {{TOBN(0x0d0df6ce, 0x51efb310), TOBN(0xcb5b2eb4, 0x958df5be), + TOBN(0xd6459e29, 0x36158e59), TOBN(0x82aae2b9, 0x1466e336)}, + {TOBN(0xfb658a39, 0x411aa636), TOBN(0x7152ecc5, 0xd4c0a933), + TOBN(0xf10c758a, 0x49f026b7), TOBN(0xf4837f97, 0xcb09311f)}}, + {{TOBN(0xddfb02c4, 0xc753c45f), TOBN(0x18ca81b6, 0xf9c840fe), + TOBN(0x846fd09a, 0xb0f8a3e6), TOBN(0xb1162add, 0xe7733dbc)}, + {TOBN(0x7070ad20, 0x236e3ab6), TOBN(0xf88cdaf5, 0xb2a56326), + TOBN(0x05fc8719, 0x997cbc7a), TOBN(0x442cd452, 0x4b665272)}}, + {{TOBN(0x7807f364, 0xb71698f5), TOBN(0x6ba418d2, 0x9f7b605e), + TOBN(0xfd20b00f, 0xa03b2cbb), TOBN(0x883eca37, 0xda54386f)}, + {TOBN(0xff0be43f, 0xf3437f24), TOBN(0xe910b432, 0xa48bb33c), + TOBN(0x4963a128, 0x329df765), TOBN(0xac1dd556, 0xbe2fe6f7)}}, + {{TOBN(0x557610f9, 0x24a0a3fc), TOBN(0x38e17bf4, 0xe881c3f9), + TOBN(0x6ba84faf, 0xed0dac99), TOBN(0xd4a222c3, 0x59eeb918)}, + {TOBN(0xc79c1dbe, 0x13f542b6), TOBN(0x1fc65e0d, 0xe425d457), + TOBN(0xeffb754f, 0x1debb779), TOBN(0x638d8fd0, 0x9e08af60)}}, + {{TOBN(0x994f523a, 0x626332d5), TOBN(0x7bc38833, 0x5561bb44), + TOBN(0x005ed4b0, 0x3d845ea2), TOBN(0xd39d3ee1, 0xc2a1f08a)}, + {TOBN(0x6561fdd3, 0xe7676b0d), TOBN(0x620e35ff, 0xfb706017), + TOBN(0x36ce424f, 0xf264f9a8), TOBN(0xc4c3419f, 0xda2681f7)}}, + {{TOBN(0xfb6afd2f, 0x69beb6e8), TOBN(0x3a50b993, 0x6d700d03), + TOBN(0xc840b2ad, 0x0c83a14f), TOBN(0x573207be, 
0x54085bef)}, + {TOBN(0x5af882e3, 0x09fe7e5b), TOBN(0x957678a4, 0x3b40a7e1), + TOBN(0x172d4bdd, 0x543056e2), TOBN(0x9c1b26b4, 0x0df13c0a)}}, + {{TOBN(0x1c30861c, 0xf405ff06), TOBN(0xebac86bd, 0x486e828b), + TOBN(0xe791a971, 0x636933fc), TOBN(0x50e7c2be, 0x7aeee947)}, + {TOBN(0xc3d4a095, 0xfa90d767), TOBN(0xae60eb7b, 0xe670ab7b), + TOBN(0x17633a64, 0x397b056d), TOBN(0x93a21f33, 0x105012aa)}}, + {{TOBN(0x663c370b, 0xabb88643), TOBN(0x91df36d7, 0x22e21599), + TOBN(0x183ba835, 0x8b761671), TOBN(0x381eea1d, 0x728f3bf1)}, + {TOBN(0xb9b2f1ba, 0x39966e6c), TOBN(0x7c464a28, 0xe7295492), + TOBN(0x0fd5f70a, 0x09b26b7f), TOBN(0xa9aba1f9, 0xfbe009df)}}, + {{TOBN(0x857c1f22, 0x369b87ad), TOBN(0x3c00e5d9, 0x32fca556), + TOBN(0x1ad74cab, 0x90b06466), TOBN(0xa7112386, 0x550faaf2)}, + {TOBN(0x7435e198, 0x6d9bd5f5), TOBN(0x2dcc7e38, 0x59c3463f), + TOBN(0xdc7df748, 0xca7bd4b2), TOBN(0x13cd4c08, 0x9dec2f31)}}, + {{TOBN(0x0d3b5df8, 0xe3237710), TOBN(0x0dadb26e, 0xcbd2f7b0), + TOBN(0x9f5966ab, 0xe4aa082b), TOBN(0x666ec8de, 0x350e966e)}, + {TOBN(0x1bfd1ed5, 0xee524216), TOBN(0xcd93c59b, 0x41dab0b6), + TOBN(0x658a8435, 0xd186d6ba), TOBN(0x1b7d34d2, 0x159d1195)}}, + {{TOBN(0x5936e460, 0x22caf46b), TOBN(0x6a45dd8f, 0x9a96fe4f), + TOBN(0xf7925434, 0xb98f474e), TOBN(0x41410412, 0x0053ef15)}, + {TOBN(0x71cf8d12, 0x41de97bf), TOBN(0xb8547b61, 0xbd80bef4), + TOBN(0xb47d3970, 0xc4db0037), TOBN(0xf1bcd328, 0xfef20dff)}}, + {{TOBN(0x31a92e09, 0x10caad67), TOBN(0x1f591960, 0x5531a1e1), + TOBN(0x3bb852e0, 0x5f4fc840), TOBN(0x63e297ca, 0x93a72c6c)}, + {TOBN(0x3c2b0b2e, 0x49abad67), TOBN(0x6ec405fc, 0xed3db0d9), + TOBN(0xdc14a530, 0x7fef1d40), TOBN(0xccd19846, 0x280896fc)}}, + {{TOBN(0x00f83176, 0x9bb81648), TOBN(0xd69eb485, 0x653120d0), + TOBN(0xd17d75f4, 0x4ccabc62), TOBN(0x34a07f82, 0xb749fcb1)}, + {TOBN(0x2c3af787, 0xbbfb5554), TOBN(0xb06ed4d0, 0x62e283f8), + TOBN(0x5722889f, 0xa19213a0), TOBN(0x162b085e, 0xdcf3c7b4)}}, + {{TOBN(0xbcaecb31, 0xe0dd3eca), TOBN(0xc6237fbc, 0xe52f13a5), + TOBN(0xcc2b6b03, 0x27bac297), TOBN(0x2ae1cac5, 0xb917f54a)}, + {TOBN(0x474807d4, 0x7845ae4f), TOBN(0xfec7dd92, 0xce5972e0), + TOBN(0xc3bd2541, 0x1d7915bb), TOBN(0x66f85dc4, 0xd94907ca)}}, + {{TOBN(0xd981b888, 0xbdbcf0ca), TOBN(0xd75f5da6, 0xdf279e9f), + TOBN(0x128bbf24, 0x7054e934), TOBN(0x3c6ff6e5, 0x81db134b)}, + {TOBN(0x795b7cf4, 0x047d26e4), TOBN(0xf370f7b8, 0x5049ec37), + TOBN(0xc6712d4d, 0xced945af), TOBN(0xdf30b5ec, 0x095642bc)}}, + {{TOBN(0x9b034c62, 0x4896246e), TOBN(0x5652c016, 0xee90bbd1), + TOBN(0xeb38636f, 0x87fedb73), TOBN(0x5e32f847, 0x0135a613)}, + {TOBN(0x0703b312, 0xcf933c83), TOBN(0xd05bb76e, 0x1a7f47e6), + TOBN(0x825e4f0c, 0x949c2415), TOBN(0x569e5622, 0x7250d6f8)}}, + {{TOBN(0xbbe9eb3a, 0x6568013e), TOBN(0x8dbd203f, 0x22f243fc), + TOBN(0x9dbd7694, 0xb342734a), TOBN(0x8f6d12f8, 0x46afa984)}, + {TOBN(0xb98610a2, 0xc9eade29), TOBN(0xbab4f323, 0x47dd0f18), + TOBN(0x5779737b, 0x671c0d46), TOBN(0x10b6a7c6, 0xd3e0a42a)}}, + {{TOBN(0xfb19ddf3, 0x3035b41c), TOBN(0xd336343f, 0x99c45895), + TOBN(0x61fe4938, 0x54c857e5), TOBN(0xc4d506be, 0xae4e57d5)}, + {TOBN(0x3cd8c8cb, 0xbbc33f75), TOBN(0x7281f08a, 0x9262c77d), + TOBN(0x083f4ea6, 0xf11a2823), TOBN(0x8895041e, 0x9fba2e33)}}, + {{TOBN(0xfcdfea49, 0x9c438edf), TOBN(0x7678dcc3, 0x91edba44), + TOBN(0xf07b3b87, 0xe2ba50f0), TOBN(0xc13888ef, 0x43948c1b)}, + {TOBN(0xc2135ad4, 0x1140af42), TOBN(0x8e5104f3, 0x926ed1a7), + TOBN(0xf24430cb, 0x88f6695f), TOBN(0x0ce0637b, 0x6d73c120)}}, + {{TOBN(0xb2db01e6, 0xfe631e8f), TOBN(0x1c5563d7, 0xd7bdd24b), + TOBN(0x8daea3ba, 0x369ad44f), 
TOBN(0x000c81b6, 0x8187a9f9)}, + {TOBN(0x5f48a951, 0xaae1fd9a), TOBN(0xe35626c7, 0x8d5aed8a), + TOBN(0x20952763, 0x0498c622), TOBN(0x76d17634, 0x773aa504)}}, + {{TOBN(0x36d90dda, 0xeb300f7a), TOBN(0x9dcf7dfc, 0xedb5e801), + TOBN(0x645cb268, 0x74d5244c), TOBN(0xa127ee79, 0x348e3aa2)}, + {TOBN(0x488acc53, 0x575f1dbb), TOBN(0x95037e85, 0x80e6161e), + TOBN(0x57e59283, 0x292650d0), TOBN(0xabe67d99, 0x14938216)}}, + {{TOBN(0x3c7f944b, 0x3f8e1065), TOBN(0xed908cb6, 0x330e8924), + TOBN(0x08ee8fd5, 0x6f530136), TOBN(0x2227b7d5, 0xd7ffc169)}, + {TOBN(0x4f55c893, 0xb5cd6dd5), TOBN(0x82225e11, 0xa62796e8), + TOBN(0x5c6cead1, 0xcb18e12c), TOBN(0x4381ae0c, 0x84f5a51a)}}, + {{TOBN(0x345913d3, 0x7fafa4c8), TOBN(0x3d918082, 0x0491aac0), + TOBN(0x9347871f, 0x3e69264c), TOBN(0xbea9dd3c, 0xb4f4f0cd)}, + {TOBN(0xbda5d067, 0x3eadd3e7), TOBN(0x0033c1b8, 0x0573bcd8), + TOBN(0x25589379, 0x5da2486c), TOBN(0xcb89ee5b, 0x86abbee7)}}, + {{TOBN(0x8fe0a8f3, 0x22532e5d), TOBN(0xb6410ff0, 0x727dfc4c), + TOBN(0x619b9d58, 0x226726db), TOBN(0x5ec25669, 0x7a2b2dc7)}, + {TOBN(0xaf4d2e06, 0x4c3beb01), TOBN(0x852123d0, 0x7acea556), + TOBN(0x0e9470fa, 0xf783487a), TOBN(0x75a7ea04, 0x5664b3eb)}}, + {{TOBN(0x4ad78f35, 0x6798e4ba), TOBN(0x9214e6e5, 0xc7d0e091), + TOBN(0xc420b488, 0xb1290403), TOBN(0x64049e0a, 0xfc295749)}, + {TOBN(0x03ef5af1, 0x3ae9841f), TOBN(0xdbe4ca19, 0xb0b662a6), + TOBN(0x46845c5f, 0xfa453458), TOBN(0xf8dabf19, 0x10b66722)}}, + {{TOBN(0xb650f0aa, 0xcce2793b), TOBN(0x71db851e, 0xc5ec47c1), + TOBN(0x3eb78f3e, 0x3b234fa9), TOBN(0xb0c60f35, 0xfc0106ce)}, + {TOBN(0x05427121, 0x774eadbd), TOBN(0x25367faf, 0xce323863), + TOBN(0x7541b5c9, 0xcd086976), TOBN(0x4ff069e2, 0xdc507ad1)}}, + {{TOBN(0x74145256, 0x8776e667), TOBN(0x6e76142c, 0xb23c6bb5), + TOBN(0xdbf30712, 0x1b3a8a87), TOBN(0x60e7363e, 0x98450836)}, + {TOBN(0x5741450e, 0xb7366d80), TOBN(0xe4ee14ca, 0x4837dbdf), + TOBN(0xa765eb9b, 0x69d4316f), TOBN(0x04548dca, 0x8ef43825)}}, + {{TOBN(0x9c9f4e4c, 0x5ae888eb), TOBN(0x733abb51, 0x56e9ac99), + TOBN(0xdaad3c20, 0xba6ac029), TOBN(0x9b8dd3d3, 0x2ba3e38e)}, + {TOBN(0xa9bb4c92, 0x0bc5d11a), TOBN(0xf20127a7, 0x9c5f88a3), + TOBN(0x4f52b06e, 0x161d3cb8), TOBN(0x26c1ff09, 0x6afaf0a6)}}, + {{TOBN(0x32670d2f, 0x7189e71f), TOBN(0xc6438748, 0x5ecf91e7), + TOBN(0x15758e57, 0xdb757a21), TOBN(0x427d09f8, 0x290a9ce5)}, + {TOBN(0x846a308f, 0x38384a7a), TOBN(0xaac3acb4, 0xb0732b99), + TOBN(0x9e941009, 0x17845819), TOBN(0x95cba111, 0xa7ce5e03)}}, + {{TOBN(0x6f3d4f7f, 0xb00009c4), TOBN(0xb8396c27, 0x8ff28b5f), + TOBN(0xb1a9ae43, 0x1c97975d), TOBN(0x9d7ba8af, 0xe5d9fed5)}, + {TOBN(0x338cf09f, 0x34f485b6), TOBN(0xbc0ddacc, 0x64122516), + TOBN(0xa450da12, 0x05d471fe), TOBN(0x4c3a6250, 0x628dd8c9)}}, + {{TOBN(0x69c7d103, 0xd1295837), TOBN(0xa2893e50, 0x3807eb2f), + TOBN(0xd6e1e1de, 0xbdb41491), TOBN(0xc630745b, 0x5e138235)}, + {TOBN(0xc892109e, 0x48661ae1), TOBN(0x8d17e7eb, 0xea2b2674), + TOBN(0x00ec0f87, 0xc328d6b5), TOBN(0x6d858645, 0xf079ff9e)}}, + {{TOBN(0x6cdf243e, 0x19115ead), TOBN(0x1ce1393e, 0x4bac4fcf), + TOBN(0x2c960ed0, 0x9c29f25b), TOBN(0x59be4d8e, 0x9d388a05)}, + {TOBN(0x0d46e06c, 0xd0def72b), TOBN(0xb923db5d, 0xe0342748), + TOBN(0xf7d3aacd, 0x936d4a3d), TOBN(0x558519cc, 0x0b0b099e)}}, + {{TOBN(0x3ea8ebf8, 0x827097ef), TOBN(0x259353db, 0xd054f55d), + TOBN(0x84c89abc, 0x6d2ed089), TOBN(0x5c548b69, 0x8e096a7c)}, + {TOBN(0xd587f616, 0x994b995d), TOBN(0x4d1531f6, 0xa5845601), + TOBN(0x792ab31e, 0x451fd9f0), TOBN(0xc8b57bb2, 0x65adf6ca)}}, + {{TOBN(0x68440fcb, 0x1cd5ad73), TOBN(0xb9c860e6, 0x6144da4f), + TOBN(0x2ab286aa, 
0x8462beb8), TOBN(0xcc6b8fff, 0xef46797f)}, + {TOBN(0xac820da4, 0x20c8a471), TOBN(0x69ae05a1, 0x77ff7faf), + TOBN(0xb9163f39, 0xbfb5da77), TOBN(0xbd03e590, 0x2c73ab7a)}}, + {{TOBN(0x7e862b5e, 0xb2940d9e), TOBN(0x3c663d86, 0x4b9af564), + TOBN(0xd8309031, 0xbde3033d), TOBN(0x298231b2, 0xd42c5bc6)}, + {TOBN(0x42090d2c, 0x552ad093), TOBN(0xa4799d1c, 0xff854695), + TOBN(0x0a88b5d6, 0xd31f0d00), TOBN(0xf8b40825, 0xa2f26b46)}}, + {{TOBN(0xec29b1ed, 0xf1bd7218), TOBN(0xd491c53b, 0x4b24c86e), + TOBN(0xd2fe588f, 0x3395ea65), TOBN(0x6f3764f7, 0x4456ef15)}, + {TOBN(0xdb43116d, 0xcdc34800), TOBN(0xcdbcd456, 0xc1e33955), + TOBN(0xefdb5540, 0x74ab286b), TOBN(0x948c7a51, 0xd18c5d7c)}}, + {{TOBN(0xeb81aa37, 0x7378058e), TOBN(0x41c746a1, 0x04411154), + TOBN(0xa10c73bc, 0xfb828ac7), TOBN(0x6439be91, 0x9d972b29)}, + {TOBN(0x4bf3b4b0, 0x43a2fbad), TOBN(0x39e6dadf, 0x82b5e840), + TOBN(0x4f716408, 0x6397bd4c), TOBN(0x0f7de568, 0x7f1eeccb)}}, + {{TOBN(0x5865c5a1, 0xd2ffbfc1), TOBN(0xf74211fa, 0x4ccb6451), + TOBN(0x66368a88, 0xc0b32558), TOBN(0x5b539dc2, 0x9ad7812e)}, + {TOBN(0x579483d0, 0x2f3af6f6), TOBN(0x52132078, 0x99934ece), + TOBN(0x50b9650f, 0xdcc9e983), TOBN(0xca989ec9, 0xaee42b8a)}}, + {{TOBN(0x6a44c829, 0xd6f62f99), TOBN(0x8f06a309, 0x4c2a7c0c), + TOBN(0x4ea2b3a0, 0x98a0cb0a), TOBN(0x5c547b70, 0xbeee8364)}, + {TOBN(0x461d40e1, 0x682afe11), TOBN(0x9e0fc77a, 0x7b41c0a8), + TOBN(0x79e4aefd, 0xe20d5d36), TOBN(0x2916e520, 0x32dd9f63)}}, + {{TOBN(0xf59e52e8, 0x3f883faf), TOBN(0x396f9639, 0x2b868d35), + TOBN(0xc902a9df, 0x4ca19881), TOBN(0x0fc96822, 0xdb2401a6)}, + {TOBN(0x41237587, 0x66f1c68d), TOBN(0x10fc6de3, 0xfb476c0d), + TOBN(0xf8b6b579, 0x841f5d90), TOBN(0x2ba8446c, 0xfa24f44a)}}, + {{TOBN(0xa237b920, 0xef4a9975), TOBN(0x60bb6004, 0x2330435f), + TOBN(0xd6f4ab5a, 0xcfb7e7b5), TOBN(0xb2ac5097, 0x83435391)}, + {TOBN(0xf036ee2f, 0xb0d1ea67), TOBN(0xae779a6a, 0x74c56230), + TOBN(0x59bff8c8, 0xab838ae6), TOBN(0xcd83ca99, 0x9b38e6f0)}}, + {{TOBN(0xbb27bef5, 0xe33deed3), TOBN(0xe6356f6f, 0x001892a8), + TOBN(0xbf3be6cc, 0x7adfbd3e), TOBN(0xaecbc81c, 0x33d1ac9d)}, + {TOBN(0xe4feb909, 0xe6e861dc), TOBN(0x90a247a4, 0x53f5f801), + TOBN(0x01c50acb, 0x27346e57), TOBN(0xce29242e, 0x461acc1b)}}, + {{TOBN(0x04dd214a, 0x2f998a91), TOBN(0x271ee9b1, 0xd4baf27b), + TOBN(0x7e3027d1, 0xe8c26722), TOBN(0x21d1645c, 0x1820dce5)}, + {TOBN(0x086f242c, 0x7501779c), TOBN(0xf0061407, 0xfa0e8009), + TOBN(0xf23ce477, 0x60187129), TOBN(0x05bbdedb, 0x0fde9bd0)}}, + {{TOBN(0x682f4832, 0x25d98473), TOBN(0xf207fe85, 0x5c658427), + TOBN(0xb6fdd7ba, 0x4166ffa1), TOBN(0x0c314056, 0x9eed799d)}, + {TOBN(0x0db8048f, 0x4107e28f), TOBN(0x74ed3871, 0x41216840), + TOBN(0x74489f8f, 0x56a3c06e), TOBN(0x1e1c005b, 0x12777134)}}, + {{TOBN(0xdb332a73, 0xf37ec3c3), TOBN(0xc65259bd, 0xdd59eba0), + TOBN(0x2291709c, 0xdb4d3257), TOBN(0x9a793b25, 0xbd389390)}, + {TOBN(0xf39fe34b, 0xe43756f0), TOBN(0x2f76bdce, 0x9afb56c9), + TOBN(0x9f37867a, 0x61208b27), TOBN(0xea1d4307, 0x089972c3)}}, + {{TOBN(0x8c595330, 0x8bdf623a), TOBN(0x5f5accda, 0x8441fb7d), + TOBN(0xfafa9418, 0x32ddfd95), TOBN(0x6ad40c5a, 0x0fde9be7)}, + {TOBN(0x43faba89, 0xaeca8709), TOBN(0xc64a7cf1, 0x2c248a9d), + TOBN(0x16620252, 0x72637a76), TOBN(0xaee1c791, 0x22b8d1bb)}}, + {{TOBN(0xf0f798fd, 0x21a843b2), TOBN(0x56e4ed4d, 0x8d005cb1), + TOBN(0x355f7780, 0x1f0d8abe), TOBN(0x197b04cf, 0x34522326)}, + {TOBN(0x41f9b31f, 0xfd42c13f), TOBN(0x5ef7feb2, 0xb40f933d), + TOBN(0x27326f42, 0x5d60bad4), TOBN(0x027ecdb2, 0x8c92cf89)}}, + {{TOBN(0x04aae4d1, 0x4e3352fe), TOBN(0x08414d2f, 0x73591b90), + 
TOBN(0x5ed6124e, 0xb7da7d60), TOBN(0xb985b931, 0x4d13d4ec)}, + {TOBN(0xa592d3ab, 0x96bf36f9), TOBN(0x012dbed5, 0xbbdf51df), + TOBN(0xa57963c0, 0xdf6c177d), TOBN(0x010ec869, 0x87ca29cf)}}, + {{TOBN(0xba1700f6, 0xbf926dff), TOBN(0x7c9fdbd1, 0xf4bf6bc2), + TOBN(0xdc18dc8f, 0x64da11f5), TOBN(0xa6074b7a, 0xd938ae75)}, + {TOBN(0x14270066, 0xe84f44a4), TOBN(0x99998d38, 0xd27b954e), + TOBN(0xc1be8ab2, 0xb4f38e9a), TOBN(0x8bb55bbf, 0x15c01016)}}, + {{TOBN(0xf73472b4, 0x0ea2ab30), TOBN(0xd365a340, 0xf73d68dd), + TOBN(0xc01a7168, 0x19c2e1eb), TOBN(0x32f49e37, 0x34061719)}, + {TOBN(0xb73c57f1, 0x01d8b4d6), TOBN(0x03c8423c, 0x26b47700), + TOBN(0x321d0bc8, 0xa4d8826a), TOBN(0x6004213c, 0x4bc0e638)}}, + {{TOBN(0xf78c64a1, 0xc1c06681), TOBN(0x16e0a16f, 0xef018e50), + TOBN(0x31cbdf91, 0xdb42b2b3), TOBN(0xf8f4ffce, 0xe0d36f58)}, + {TOBN(0xcdcc71cd, 0x4cc5e3e0), TOBN(0xd55c7cfa, 0xa129e3e0), + TOBN(0xccdb6ba0, 0x0fb2cbf1), TOBN(0x6aba0005, 0xc4bce3cb)}}, + {{TOBN(0x501cdb30, 0xd232cfc4), TOBN(0x9ddcf12e, 0xd58a3cef), + TOBN(0x02d2cf9c, 0x87e09149), TOBN(0xdc5d7ec7, 0x2c976257)}, + {TOBN(0x6447986e, 0x0b50d7dd), TOBN(0x88fdbaf7, 0x807f112a), + TOBN(0x58c9822a, 0xb00ae9f6), TOBN(0x6abfb950, 0x6d3d27e0)}}, + {{TOBN(0xd0a74487, 0x8a429f4f), TOBN(0x0649712b, 0xdb516609), + TOBN(0xb826ba57, 0xe769b5df), TOBN(0x82335df2, 0x1fc7aaf2)}, + {TOBN(0x2389f067, 0x5c93d995), TOBN(0x59ac367a, 0x68677be6), + TOBN(0xa77985ff, 0x21d9951b), TOBN(0x038956fb, 0x85011cce)}}, + {{TOBN(0x608e48cb, 0xbb734e37), TOBN(0xc08c0bf2, 0x2be5b26f), + TOBN(0x17bbdd3b, 0xf9b1a0d9), TOBN(0xeac7d898, 0x10483319)}, + {TOBN(0xc95c4baf, 0xbc1a6dea), TOBN(0xfdd0e2bf, 0x172aafdb), + TOBN(0x40373cbc, 0x8235c41a), TOBN(0x14303f21, 0xfb6f41d5)}}, + {{TOBN(0xba063621, 0x0408f237), TOBN(0xcad3b09a, 0xecd2d1ed), + TOBN(0x4667855a, 0x52abb6a2), TOBN(0xba9157dc, 0xaa8b417b)}, + {TOBN(0xfe7f3507, 0x4f013efb), TOBN(0x1b112c4b, 0xaa38c4a2), + TOBN(0xa1406a60, 0x9ba64345), TOBN(0xe53cba33, 0x6993c80b)}}, + {{TOBN(0x45466063, 0xded40d23), TOBN(0x3d5f1f4d, 0x54908e25), + TOBN(0x9ebefe62, 0x403c3c31), TOBN(0x274ea0b5, 0x0672a624)}, + {TOBN(0xff818d99, 0x451d1b71), TOBN(0x80e82643, 0x8f79cf79), + TOBN(0xa165df13, 0x73ce37f5), TOBN(0xa744ef4f, 0xfe3a21fd)}}, + {{TOBN(0x73f1e7f5, 0xcf551396), TOBN(0xc616898e, 0x868c676b), + TOBN(0x671c28c7, 0x8c442c36), TOBN(0xcfe5e558, 0x5e0a317d)}, + {TOBN(0x1242d818, 0x7051f476), TOBN(0x56fad2a6, 0x14f03442), + TOBN(0x262068bc, 0x0a44d0f6), TOBN(0xdfa2cd6e, 0xce6edf4e)}}, + {{TOBN(0x0f43813a, 0xd15d1517), TOBN(0x61214cb2, 0x377d44f5), + TOBN(0xd399aa29, 0xc639b35f), TOBN(0x42136d71, 0x54c51c19)}, + {TOBN(0x9774711b, 0x08417221), TOBN(0x0a5546b3, 0x52545a57), + TOBN(0x80624c41, 0x1150582d), TOBN(0x9ec5c418, 0xfbc555bc)}}, + {{TOBN(0x2c87dcad, 0x771849f1), TOBN(0xb0c932c5, 0x01d7bf6f), + TOBN(0x6aa5cd3e, 0x89116eb2), TOBN(0xd378c25a, 0x51ca7bd3)}, + {TOBN(0xc612a0da, 0x9e6e3e31), TOBN(0x0417a54d, 0xb68ad5d0), + TOBN(0x00451e4a, 0x22c6edb8), TOBN(0x9fbfe019, 0xb42827ce)}}, + {{TOBN(0x2fa92505, 0xba9384a2), TOBN(0x21b8596e, 0x64ad69c1), + TOBN(0x8f4fcc49, 0x983b35a6), TOBN(0xde093760, 0x72754672)}, + {TOBN(0x2f14ccc8, 0xf7bffe6d), TOBN(0x27566bff, 0x5d94263d), + TOBN(0xb5b4e9c6, 0x2df3ec30), TOBN(0x94f1d7d5, 0x3e6ea6ba)}}, + {{TOBN(0x97b7851a, 0xaaca5e9b), TOBN(0x518aa521, 0x56713b97), + TOBN(0x3357e8c7, 0x150a61f6), TOBN(0x7842e7e2, 0xec2c2b69)}, + {TOBN(0x8dffaf65, 0x6868a548), TOBN(0xd963bd82, 0xe068fc81), + TOBN(0x64da5c8b, 0x65917733), TOBN(0x927090ff, 0x7b247328)}}}, + {{{TOBN(0x214bc9a7, 0xd298c241), TOBN(0xe3b697ba, 
0x56807cfd), + TOBN(0xef1c7802, 0x4564eadb), TOBN(0xdde8cdcf, 0xb48149c5)}, + {TOBN(0x946bf0a7, 0x5a4d2604), TOBN(0x27154d7f, 0x6c1538af), + TOBN(0x95cc9230, 0xde5b1fcc), TOBN(0xd88519e9, 0x66864f82)}}, + {{TOBN(0xb828dd1a, 0x7cb1282c), TOBN(0xa08d7626, 0xbe46973a), + TOBN(0x6baf8d40, 0xe708d6b2), TOBN(0x72571fa1, 0x4daeb3f3)}, + {TOBN(0x85b1732f, 0xf22dfd98), TOBN(0x87ab01a7, 0x0087108d), + TOBN(0xaaaafea8, 0x5988207a), TOBN(0xccc832f8, 0x69f00755)}}, + {{TOBN(0x964d950e, 0x36ff3bf0), TOBN(0x8ad20f6f, 0xf0b34638), + TOBN(0x4d9177b3, 0xb5d7585f), TOBN(0xcf839760, 0xef3f019f)}, + {TOBN(0x582fc5b3, 0x8288c545), TOBN(0x2f8e4e9b, 0x13116bd1), + TOBN(0xf91e1b2f, 0x332120ef), TOBN(0xcf568724, 0x2a17dd23)}}, + {{TOBN(0x488f1185, 0xca8d9d1a), TOBN(0xadf2c77d, 0xd987ded2), + TOBN(0x5f3039f0, 0x60c46124), TOBN(0xe5d70b75, 0x71e095f4)}, + {TOBN(0x82d58650, 0x6260e70f), TOBN(0x39d75ea7, 0xf750d105), + TOBN(0x8cf3d0b1, 0x75bac364), TOBN(0xf3a7564d, 0x21d01329)}}, + {{TOBN(0x182f04cd, 0x2f52d2a7), TOBN(0x4fde149a, 0xe2df565a), + TOBN(0xb80c5eec, 0xa79fb2f7), TOBN(0xab491d7b, 0x22ddc897)}, + {TOBN(0x99d76c18, 0xc6312c7f), TOBN(0xca0d5f3d, 0x6aa41a57), + TOBN(0x71207325, 0xd15363a0), TOBN(0xe82aa265, 0xbeb252c2)}}, + {{TOBN(0x94ab4700, 0xec3128c2), TOBN(0x6c76d862, 0x8e383f49), + TOBN(0xdc36b150, 0xc03024eb), TOBN(0xfb439477, 0x53daac69)}, + {TOBN(0xfc68764a, 0x8dc79623), TOBN(0x5b86995d, 0xb440fbb2), + TOBN(0xd66879bf, 0xccc5ee0d), TOBN(0x05228942, 0x95aa8bd3)}}, + {{TOBN(0xb51a40a5, 0x1e6a75c1), TOBN(0x24327c76, 0x0ea7d817), + TOBN(0x06630182, 0x07774597), TOBN(0xd6fdbec3, 0x97fa7164)}, + {TOBN(0x20c99dfb, 0x13c90f48), TOBN(0xd6ac5273, 0x686ef263), + TOBN(0xc6a50bdc, 0xfef64eeb), TOBN(0xcd87b281, 0x86fdfc32)}}, + {{TOBN(0xb24aa43e, 0x3fcd3efc), TOBN(0xdd26c034, 0xb8088e9a), + TOBN(0xa5ef4dc9, 0xbd3d46ea), TOBN(0xa2f99d58, 0x8a4c6a6f)}, + {TOBN(0xddabd355, 0x2f1da46c), TOBN(0x72c3f8ce, 0x1afacdd1), + TOBN(0xd90c4eee, 0x92d40578), TOBN(0xd28bb41f, 0xca623b94)}}, + {{TOBN(0x50fc0711, 0x745edc11), TOBN(0x9dd9ad7d, 0x3dc87558), + TOBN(0xce6931fb, 0xb49d1e64), TOBN(0x6c77a0a2, 0xc98bd0f9)}, + {TOBN(0x62b9a629, 0x6baf7cb1), TOBN(0xcf065f91, 0xccf72d22), + TOBN(0x7203cce9, 0x79639071), TOBN(0x09ae4885, 0xf9cb732f)}}, + {{TOBN(0x5e7c3bec, 0xee8314f3), TOBN(0x1c068aed, 0xdbea298f), + TOBN(0x08d381f1, 0x7c80acec), TOBN(0x03b56be8, 0xe330495b)}, + {TOBN(0xaeffb8f2, 0x9222882d), TOBN(0x95ff38f6, 0xc4af8bf7), + TOBN(0x50e32d35, 0x1fc57d8c), TOBN(0x6635be52, 0x17b444f0)}}, + {{TOBN(0x04d15276, 0xa5177900), TOBN(0x4e1dbb47, 0xf6858752), + TOBN(0x5b475622, 0xc615796c), TOBN(0xa6fa0387, 0x691867bf)}, + {TOBN(0xed7f5d56, 0x2844c6d0), TOBN(0xc633cf9b, 0x03a2477d), + TOBN(0xf6be5c40, 0x2d3721d6), TOBN(0xaf312eb7, 0xe9fd68e6)}}, + {{TOBN(0x242792d2, 0xe7417ce1), TOBN(0xff42bc71, 0x970ee7f5), + TOBN(0x1ff4dc6d, 0x5c67a41e), TOBN(0x77709b7b, 0x20882a58)}, + {TOBN(0x3554731d, 0xbe217f2c), TOBN(0x2af2a8cd, 0x5bb72177), + TOBN(0x58eee769, 0x591dd059), TOBN(0xbb2930c9, 0x4bba6477)}}, + {{TOBN(0x863ee047, 0x7d930cfc), TOBN(0x4c262ad1, 0x396fd1f4), + TOBN(0xf4765bc8, 0x039af7e1), TOBN(0x2519834b, 0x5ba104f6)}, + {TOBN(0x7cd61b4c, 0xd105f961), TOBN(0xa5415da5, 0xd63bca54), + TOBN(0x778280a0, 0x88a1f17c), TOBN(0xc4968949, 0x2329512c)}}, + {{TOBN(0x174a9126, 0xcecdaa7a), TOBN(0xfc8c7e0e, 0x0b13247b), + TOBN(0x29c110d2, 0x3484c1c4), TOBN(0xf8eb8757, 0x831dfc3b)}, + {TOBN(0x022f0212, 0xc0067452), TOBN(0x3f6f69ee, 0x7b9b926c), + TOBN(0x09032da0, 0xef42daf4), TOBN(0x79f00ade, 0x83f80de4)}}, + {{TOBN(0x6210db71, 0x81236c97), 
TOBN(0x74f7685b, 0x3ee0781f), + TOBN(0x4df7da7b, 0xa3e41372), TOBN(0x2aae38b1, 0xb1a1553e)}, + {TOBN(0x1688e222, 0xf6dd9d1b), TOBN(0x57695448, 0x5b8b6487), + TOBN(0x478d2127, 0x4b2edeaa), TOBN(0xb2818fa5, 0x1e85956a)}}, + {{TOBN(0x1e6addda, 0xf176f2c0), TOBN(0x01ca4604, 0xe2572658), + TOBN(0x0a404ded, 0x85342ffb), TOBN(0x8cf60f96, 0x441838d6)}, + {TOBN(0x9bbc691c, 0xc9071c4a), TOBN(0xfd588744, 0x34442803), + TOBN(0x97101c85, 0x809c0d81), TOBN(0xa7fb754c, 0x8c456f7f)}}, + {{TOBN(0xc95f3c5c, 0xd51805e1), TOBN(0xab4ccd39, 0xb299dca8), + TOBN(0x3e03d20b, 0x47eaf500), TOBN(0xfa3165c1, 0xd7b80893)}, + {TOBN(0x005e8b54, 0xe160e552), TOBN(0xdc4972ba, 0x9019d11f), + TOBN(0x21a6972e, 0x0c9a4a7a), TOBN(0xa52c258f, 0x37840fd7)}}, + {{TOBN(0xf8559ff4, 0xc1e99d81), TOBN(0x08e1a7d6, 0xa3c617c0), + TOBN(0xb398fd43, 0x248c6ba7), TOBN(0x6ffedd91, 0xd1283794)}, + {TOBN(0x8a6a59d2, 0xd629d208), TOBN(0xa9d141d5, 0x3490530e), + TOBN(0x42f6fc18, 0x38505989), TOBN(0x09bf250d, 0x479d94ee)}}, + {{TOBN(0x223ad3b1, 0xb3822790), TOBN(0x6c5926c0, 0x93b8971c), + TOBN(0x609efc7e, 0x75f7fa62), TOBN(0x45d66a6d, 0x1ec2d989)}, + {TOBN(0x4422d663, 0x987d2792), TOBN(0x4a73caad, 0x3eb31d2b), + TOBN(0xf06c2ac1, 0xa32cb9e6), TOBN(0xd9445c5f, 0x91aeba84)}}, + {{TOBN(0x6af7a1d5, 0xaf71013f), TOBN(0xe68216e5, 0x0bedc946), + TOBN(0xf4cba30b, 0xd27370a0), TOBN(0x7981afbf, 0x870421cc)}, + {TOBN(0x02496a67, 0x9449f0e1), TOBN(0x86cfc4be, 0x0a47edae), + TOBN(0x3073c936, 0xb1feca22), TOBN(0xf5694612, 0x03f8f8fb)}}, + {{TOBN(0xd063b723, 0x901515ea), TOBN(0x4c6c77a5, 0x749cf038), + TOBN(0x6361e360, 0xab9e5059), TOBN(0x596cf171, 0xa76a37c0)}, + {TOBN(0x800f53fa, 0x6530ae7a), TOBN(0x0f5e631e, 0x0792a7a6), + TOBN(0x5cc29c24, 0xefdb81c9), TOBN(0xa269e868, 0x3f9c40ba)}}, + {{TOBN(0xec14f9e1, 0x2cb7191e), TOBN(0x78ea1bd8, 0xe5b08ea6), + TOBN(0x3c65aa9b, 0x46332bb9), TOBN(0x84cc22b3, 0xbf80ce25)}, + {TOBN(0x0098e9e9, 0xd49d5bf1), TOBN(0xcd4ec1c6, 0x19087da4), + TOBN(0x3c9d07c5, 0xaef6e357), TOBN(0x839a0268, 0x9f8f64b8)}}, + {{TOBN(0xc5e9eb62, 0xc6d8607f), TOBN(0x759689f5, 0x6aa995e4), + TOBN(0x70464669, 0xbbb48317), TOBN(0x921474bf, 0xe402417d)}, + {TOBN(0xcabe135b, 0x2a354c8c), TOBN(0xd51e52d2, 0x812fa4b5), + TOBN(0xec741096, 0x53311fe8), TOBN(0x4f774535, 0xb864514b)}}, + {{TOBN(0xbcadd671, 0x5bde48f8), TOBN(0xc9703873, 0x2189bc7d), + TOBN(0x5d45299e, 0xc709ee8a), TOBN(0xd1287ee2, 0x845aaff8)}, + {TOBN(0x7d1f8874, 0xdb1dbf1f), TOBN(0xea46588b, 0x990c88d6), + TOBN(0x60ba649a, 0x84368313), TOBN(0xd5fdcbce, 0x60d543ae)}}, + {{TOBN(0x90b46d43, 0x810d5ab0), TOBN(0x6739d8f9, 0x04d7e5cc), + TOBN(0x021c1a58, 0x0d337c33), TOBN(0x00a61162, 0x68e67c40)}, + {TOBN(0x95ef413b, 0x379f0a1f), TOBN(0xfe126605, 0xe9e2ab95), + TOBN(0x67578b85, 0x2f5f199c), TOBN(0xf5c00329, 0x2cb84913)}}, + {{TOBN(0xf7956430, 0x37577dd8), TOBN(0x83b82af4, 0x29c5fe88), + TOBN(0x9c1bea26, 0xcdbdc132), TOBN(0x589fa086, 0x9c04339e)}, + {TOBN(0x033e9538, 0xb13799df), TOBN(0x85fa8b21, 0xd295d034), + TOBN(0xdf17f73f, 0xbd9ddcca), TOBN(0xf32bd122, 0xddb66334)}}, + {{TOBN(0x55ef88a7, 0x858b044c), TOBN(0x1f0d69c2, 0x5aa9e397), + TOBN(0x55fd9cc3, 0x40d85559), TOBN(0xc774df72, 0x7785ddb2)}, + {TOBN(0x5dcce9f6, 0xd3bd2e1c), TOBN(0xeb30da20, 0xa85dfed0), + TOBN(0x5ed7f5bb, 0xd3ed09c4), TOBN(0x7d42a35c, 0x82a9c1bd)}}, + {{TOBN(0xcf3de995, 0x9890272d), TOBN(0x75f3432a, 0x3e713a10), + TOBN(0x5e13479f, 0xe28227b8), TOBN(0xb8561ea9, 0xfefacdc8)}, + {TOBN(0xa6a297a0, 0x8332aafd), TOBN(0x9b0d8bb5, 0x73809b62), + TOBN(0xd2fa1cfd, 0x0c63036f), TOBN(0x7a16eb55, 0xbd64bda8)}}, + {{TOBN(0x3f5cf5f6, 
0x78e62ddc), TOBN(0x2267c454, 0x07fd752b), + TOBN(0x5e361b6b, 0x5e437bbe), TOBN(0x95c59501, 0x8354e075)}, + {TOBN(0xec725f85, 0xf2b254d9), TOBN(0x844b617d, 0x2cb52b4e), + TOBN(0xed8554f5, 0xcf425fb5), TOBN(0xab67703e, 0x2af9f312)}}, + {{TOBN(0x4cc34ec1, 0x3cf48283), TOBN(0xb09daa25, 0x9c8a705e), + TOBN(0xd1e9d0d0, 0x5b7d4f84), TOBN(0x4df6ef64, 0xdb38929d)}, + {TOBN(0xe16b0763, 0xaa21ba46), TOBN(0xc6b1d178, 0xa293f8fb), + TOBN(0x0ff5b602, 0xd520aabf), TOBN(0x94d671bd, 0xc339397a)}}, + {{TOBN(0x7c7d98cf, 0x4f5792fa), TOBN(0x7c5e0d67, 0x11215261), + TOBN(0x9b19a631, 0xa7c5a6d4), TOBN(0xc8511a62, 0x7a45274d)}, + {TOBN(0x0c16621c, 0xa5a60d99), TOBN(0xf7fbab88, 0xcf5e48cb), + TOBN(0xab1e6ca2, 0xf7ddee08), TOBN(0x83bd08ce, 0xe7867f3c)}}, + {{TOBN(0xf7e48e8a, 0x2ac13e27), TOBN(0x4494f6df, 0x4eb1a9f5), + TOBN(0xedbf84eb, 0x981f0a62), TOBN(0x49badc32, 0x536438f0)}, + {TOBN(0x50bea541, 0x004f7571), TOBN(0xbac67d10, 0xdf1c94ee), + TOBN(0x253d73a1, 0xb727bc31), TOBN(0xb3d01cf2, 0x30686e28)}}, + {{TOBN(0x51b77b1b, 0x55fd0b8b), TOBN(0xa099d183, 0xfeec3173), + TOBN(0x202b1fb7, 0x670e72b7), TOBN(0xadc88b33, 0xa8e1635f)}, + {TOBN(0x34e8216a, 0xf989d905), TOBN(0xc2e68d20, 0x29b58d01), + TOBN(0x11f81c92, 0x6fe55a93), TOBN(0x15f1462a, 0x8f296f40)}}, + {{TOBN(0x1915d375, 0xea3d62f2), TOBN(0xa17765a3, 0x01c8977d), + TOBN(0x7559710a, 0xe47b26f6), TOBN(0xe0bd29c8, 0x535077a5)}, + {TOBN(0x615f976d, 0x08d84858), TOBN(0x370dfe85, 0x69ced5c1), + TOBN(0xbbc7503c, 0xa734fa56), TOBN(0xfbb9f1ec, 0x91ac4574)}}, + {{TOBN(0x95d7ec53, 0x060dd7ef), TOBN(0xeef2dacd, 0x6e657979), + TOBN(0x54511af3, 0xe2a08235), TOBN(0x1e324aa4, 0x1f4aea3d)}, + {TOBN(0x550e7e71, 0xe6e67671), TOBN(0xbccd5190, 0xbf52faf7), + TOBN(0xf880d316, 0x223cc62a), TOBN(0x0d402c7e, 0x2b32eb5d)}}, + {{TOBN(0xa40bc039, 0x306a5a3b), TOBN(0x4e0a41fd, 0x96783a1b), + TOBN(0xa1e8d39a, 0x0253cdd4), TOBN(0x6480be26, 0xc7388638)}, + {TOBN(0xee365e1d, 0x2285f382), TOBN(0x188d8d8f, 0xec0b5c36), + TOBN(0x34ef1a48, 0x1f0f4d82), TOBN(0x1a8f43e1, 0xa487d29a)}}, + {{TOBN(0x8168226d, 0x77aefb3a), TOBN(0xf69a751e, 0x1e72c253), + TOBN(0x8e04359a, 0xe9594df1), TOBN(0x475ffd7d, 0xd14c0467)}, + {TOBN(0xb5a2c2b1, 0x3844e95c), TOBN(0x85caf647, 0xdd12ef94), + TOBN(0x1ecd2a9f, 0xf1063d00), TOBN(0x1dd2e229, 0x23843311)}}, + {{TOBN(0x38f0e09d, 0x73d17244), TOBN(0x3ede7746, 0x8fc653f1), + TOBN(0xae4459f5, 0xdc20e21c), TOBN(0x00db2ffa, 0x6a8599ea)}, + {TOBN(0x11682c39, 0x30cfd905), TOBN(0x4934d074, 0xa5c112a6), + TOBN(0xbdf063c5, 0x568bfe95), TOBN(0x779a440a, 0x016c441a)}}, + {{TOBN(0x0c23f218, 0x97d6fbdc), TOBN(0xd3a5cd87, 0xe0776aac), + TOBN(0xcee37f72, 0xd712e8db), TOBN(0xfb28c70d, 0x26f74e8d)}, + {TOBN(0xffe0c728, 0xb61301a0), TOBN(0xa6282168, 0xd3724354), + TOBN(0x7ff4cb00, 0x768ffedc), TOBN(0xc51b3088, 0x03b02de9)}}, + {{TOBN(0xa5a8147c, 0x3902dda5), TOBN(0x35d2f706, 0xfe6973b4), + TOBN(0x5ac2efcf, 0xc257457e), TOBN(0x933f48d4, 0x8700611b)}, + {TOBN(0xc365af88, 0x4912beb2), TOBN(0x7f5a4de6, 0x162edf94), + TOBN(0xc646ba7c, 0x0c32f34b), TOBN(0x632c6af3, 0xb2091074)}}, + {{TOBN(0x58d4f2e3, 0x753e43a9), TOBN(0x70e1d217, 0x24d4e23f), + TOBN(0xb24bf729, 0xafede6a6), TOBN(0x7f4a94d8, 0x710c8b60)}, + {TOBN(0xaad90a96, 0x8d4faa6a), TOBN(0xd9ed0b32, 0xb066b690), + TOBN(0x52fcd37b, 0x78b6dbfd), TOBN(0x0b64615e, 0x8bd2b431)}}, + {{TOBN(0x228e2048, 0xcfb9fad5), TOBN(0xbeaa386d, 0x240b76bd), + TOBN(0x2d6681c8, 0x90dad7bc), TOBN(0x3e553fc3, 0x06d38f5e)}, + {TOBN(0xf27cdb9b, 0x9d5f9750), TOBN(0x3e85c52a, 0xd28c5b0e), + TOBN(0x190795af, 0x5247c39b), TOBN(0x547831eb, 0xbddd6828)}}, + 
{{TOBN(0xf327a227, 0x4a82f424), TOBN(0x36919c78, 0x7e47f89d), + TOBN(0xe4783919, 0x43c7392c), TOBN(0xf101b9aa, 0x2316fefe)}, + {TOBN(0xbcdc9e9c, 0x1c5009d2), TOBN(0xfb55ea13, 0x9cd18345), + TOBN(0xf5b5e231, 0xa3ce77c7), TOBN(0xde6b4527, 0xd2f2cb3d)}}, + {{TOBN(0x10f6a333, 0x9bb26f5f), TOBN(0x1e85db8e, 0x044d85b6), + TOBN(0xc3697a08, 0x94197e54), TOBN(0x65e18cc0, 0xa7cb4ea8)}, + {TOBN(0xa38c4f50, 0xa471fe6e), TOBN(0xf031747a, 0x2f13439c), + TOBN(0x53c4a6ba, 0xc007318b), TOBN(0xa8da3ee5, 0x1deccb3d)}}, + {{TOBN(0x0555b31c, 0x558216b1), TOBN(0x90c7810c, 0x2f79e6c2), + TOBN(0x9b669f4d, 0xfe8eed3c), TOBN(0x70398ec8, 0xe0fac126)}, + {TOBN(0xa96a449e, 0xf701b235), TOBN(0x0ceecdb3, 0xeb94f395), + TOBN(0x285fc368, 0xd0cb7431), TOBN(0x0d37bb52, 0x16a18c64)}}, + {{TOBN(0x05110d38, 0xb880d2dd), TOBN(0xa60f177b, 0x65930d57), + TOBN(0x7da34a67, 0xf36235f5), TOBN(0x47f5e17c, 0x183816b9)}, + {TOBN(0xc7664b57, 0xdb394af4), TOBN(0x39ba215d, 0x7036f789), + TOBN(0x46d2ca0e, 0x2f27b472), TOBN(0xc42647ee, 0xf73a84b7)}}, + {{TOBN(0x44bc7545, 0x64488f1d), TOBN(0xaa922708, 0xf4cf85d5), + TOBN(0x721a01d5, 0x53e4df63), TOBN(0x649c0c51, 0x5db46ced)}, + {TOBN(0x6bf0d64e, 0x3cffcb6c), TOBN(0xe3bf93fe, 0x50f71d96), + TOBN(0x75044558, 0xbcc194a0), TOBN(0x16ae3372, 0x6afdc554)}}, + {{TOBN(0xbfc01adf, 0x5ca48f3f), TOBN(0x64352f06, 0xe22a9b84), + TOBN(0xcee54da1, 0xc1099e4a), TOBN(0xbbda54e8, 0xfa1b89c0)}, + {TOBN(0x166a3df5, 0x6f6e55fb), TOBN(0x1ca44a24, 0x20176f88), + TOBN(0x936afd88, 0xdfb7b5ff), TOBN(0xe34c2437, 0x8611d4a0)}}, + {{TOBN(0x7effbb75, 0x86142103), TOBN(0x6704ba1b, 0x1f34fc4d), + TOBN(0x7c2a468f, 0x10c1b122), TOBN(0x36b3a610, 0x8c6aace9)}, + {TOBN(0xabfcc0a7, 0x75a0d050), TOBN(0x066f9197, 0x3ce33e32), + TOBN(0xce905ef4, 0x29fe09be), TOBN(0x89ee25ba, 0xa8376351)}}, + {{TOBN(0x2a3ede22, 0xfd29dc76), TOBN(0x7fd32ed9, 0x36f17260), + TOBN(0x0cadcf68, 0x284b4126), TOBN(0x63422f08, 0xa7951fc8)}, + {TOBN(0x562b24f4, 0x0807e199), TOBN(0xfe9ce5d1, 0x22ad4490), + TOBN(0xc2f51b10, 0x0db2b1b4), TOBN(0xeb3613ff, 0xe4541d0d)}}, + {{TOBN(0xbd2c4a05, 0x2680813b), TOBN(0x527aa55d, 0x561b08d6), + TOBN(0xa9f8a40e, 0xa7205558), TOBN(0xe3eea56f, 0x243d0bec)}, + {TOBN(0x7b853817, 0xa0ff58b3), TOBN(0xb67d3f65, 0x1a69e627), + TOBN(0x0b76bbb9, 0xa869b5d6), TOBN(0xa3afeb82, 0x546723ed)}}, + {{TOBN(0x5f24416d, 0x3e554892), TOBN(0x8413b53d, 0x430e2a45), + TOBN(0x99c56aee, 0x9032a2a0), TOBN(0x09432bf6, 0xeec367b1)}, + {TOBN(0x552850c6, 0xdaf0ecc1), TOBN(0x49ebce55, 0x5bc92048), + TOBN(0xdfb66ba6, 0x54811307), TOBN(0x1b84f797, 0x6f298597)}}, + {{TOBN(0x79590481, 0x8d1d7a0d), TOBN(0xd9fabe03, 0x3a6fa556), + TOBN(0xa40f9c59, 0xba9e5d35), TOBN(0xcb1771c1, 0xf6247577)}, + {TOBN(0x542a47ca, 0xe9a6312b), TOBN(0xa34b3560, 0x552dd8c5), + TOBN(0xfdf94de0, 0x0d794716), TOBN(0xd46124a9, 0x9c623094)}}, + {{TOBN(0x56b7435d, 0x68afe8b4), TOBN(0x27f20540, 0x6c0d8ea1), + TOBN(0x12b77e14, 0x73186898), TOBN(0xdbc3dd46, 0x7479490f)}, + {TOBN(0x951a9842, 0xc03b0c05), TOBN(0x8b1b3bb3, 0x7921bc96), + TOBN(0xa573b346, 0x2b202e0a), TOBN(0x77e4665d, 0x47254d56)}}, + {{TOBN(0x08b70dfc, 0xd23e3984), TOBN(0xab86e8bc, 0xebd14236), + TOBN(0xaa3e07f8, 0x57114ba7), TOBN(0x5ac71689, 0xab0ef4f2)}, + {TOBN(0x88fca384, 0x0139d9af), TOBN(0x72733f88, 0x76644af0), + TOBN(0xf122f72a, 0x65d74f4a), TOBN(0x13931577, 0xa5626c7a)}}, + {{TOBN(0xd5b5d9eb, 0x70f8d5a4), TOBN(0x375adde7, 0xd7bbb228), + TOBN(0x31e88b86, 0x0c1c0b32), TOBN(0xd1f568c4, 0x173edbaa)}, + {TOBN(0x1592fc83, 0x5459df02), TOBN(0x2beac0fb, 0x0fcd9a7e), + TOBN(0xb0a6fdb8, 0x1b473b0a), TOBN(0xe3224c6f, 0x0fe8fc48)}}, 
+ {{TOBN(0x680bd00e, 0xe87edf5b), TOBN(0x30385f02, 0x20e77cf5), + TOBN(0xe9ab98c0, 0x4d42d1b2), TOBN(0x72d191d2, 0xd3816d77)}, + {TOBN(0x1564daca, 0x0917d9e5), TOBN(0x394eab59, 0x1f8fed7f), + TOBN(0xa209aa8d, 0x7fbb3896), TOBN(0x5564f3b9, 0xbe6ac98e)}}, + {{TOBN(0xead21d05, 0xd73654ef), TOBN(0x68d1a9c4, 0x13d78d74), + TOBN(0x61e01708, 0x6d4973a0), TOBN(0x83da3500, 0x46e6d32a)}, + {TOBN(0x6a3dfca4, 0x68ae0118), TOBN(0xa1b9a4c9, 0xd02da069), + TOBN(0x0b2ff9c7, 0xebab8302), TOBN(0x98af07c3, 0x944ba436)}}, + {{TOBN(0x85997326, 0x995f0f9f), TOBN(0x467fade0, 0x71b58bc6), + TOBN(0x47e4495a, 0xbd625a2b), TOBN(0xfdd2d01d, 0x33c3b8cd)}, + {TOBN(0x2c38ae28, 0xc693f9fa), TOBN(0x48622329, 0x348f7999), + TOBN(0x97bf738e, 0x2161f583), TOBN(0x15ee2fa7, 0x565e8cc9)}}, + {{TOBN(0xa1a5c845, 0x5777e189), TOBN(0xcc10bee0, 0x456f2829), + TOBN(0x8ad95c56, 0xda762bd5), TOBN(0x152e2214, 0xe9d91da8)}, + {TOBN(0x975b0e72, 0x7cb23c74), TOBN(0xfd5d7670, 0xa90c66df), + TOBN(0xb5b5b8ad, 0x225ffc53), TOBN(0xab6dff73, 0xfaded2ae)}}, + {{TOBN(0xebd56781, 0x6f4cbe9d), TOBN(0x0ed8b249, 0x6a574bd7), + TOBN(0x41c246fe, 0x81a881fa), TOBN(0x91564805, 0xc3db9c70)}, + {TOBN(0xd7c12b08, 0x5b862809), TOBN(0x1facd1f1, 0x55858d7b), + TOBN(0x7693747c, 0xaf09e92a), TOBN(0x3b69dcba, 0x189a425f)}}, + {{TOBN(0x0be28e9f, 0x967365ef), TOBN(0x57300eb2, 0xe801f5c9), + TOBN(0x93b8ac6a, 0xd583352f), TOBN(0xa2cf1f89, 0xcd05b2b7)}, + {TOBN(0x7c0c9b74, 0x4dcc40cc), TOBN(0xfee38c45, 0xada523fb), + TOBN(0xb49a4dec, 0x1099cc4d), TOBN(0x325c377f, 0x69f069c6)}}, + {{TOBN(0xe12458ce, 0x476cc9ff), TOBN(0x580e0b6c, 0xc6d4cb63), + TOBN(0xd561c8b7, 0x9072289b), TOBN(0x0377f264, 0xa619e6da)}, + {TOBN(0x26685362, 0x88e591a5), TOBN(0xa453a7bd, 0x7523ca2b), + TOBN(0x8a9536d2, 0xc1df4533), TOBN(0xc8e50f2f, 0xbe972f79)}}, + {{TOBN(0xd433e50f, 0x6d3549cf), TOBN(0x6f33696f, 0xfacd665e), + TOBN(0x695bfdac, 0xce11fcb4), TOBN(0x810ee252, 0xaf7c9860)}, + {TOBN(0x65450fe1, 0x7159bb2c), TOBN(0xf7dfbebe, 0x758b357b), + TOBN(0x2b057e74, 0xd69fea72), TOBN(0xd485717a, 0x92731745)}}}, + {{{TOBN(0x896c42e8, 0xee36860c), TOBN(0xdaf04dfd, 0x4113c22d), + TOBN(0x1adbb7b7, 0x44104213), TOBN(0xe5fd5fa1, 0x1fd394ea)}, + {TOBN(0x68235d94, 0x1a4e0551), TOBN(0x6772cfbe, 0x18d10151), + TOBN(0x276071e3, 0x09984523), TOBN(0xe4e879de, 0x5a56ba98)}}, + {{TOBN(0xaaafafb0, 0x285b9491), TOBN(0x01a0be88, 0x1e4c705e), + TOBN(0xff1d4f5d, 0x2ad9caab), TOBN(0x6e349a4a, 0xc37a233f)}, + {TOBN(0xcf1c1246, 0x4a1c6a16), TOBN(0xd99e6b66, 0x29383260), + TOBN(0xea3d4366, 0x5f6d5471), TOBN(0x36974d04, 0xff8cc89b)}}, + {{TOBN(0xc26c49a1, 0xcfe89d80), TOBN(0xb42c026d, 0xda9c8371), + TOBN(0xca6c013a, 0xdad066d2), TOBN(0xfb8f7228, 0x56a4f3ee)}, + {TOBN(0x08b579ec, 0xd850935b), TOBN(0x34c1a74c, 0xd631e1b3), + TOBN(0xcb5fe596, 0xac198534), TOBN(0x39ff21f6, 0xe1f24f25)}}, + {{TOBN(0x27f29e14, 0x8f929057), TOBN(0x7a64ae06, 0xc0c853df), + TOBN(0x256cd183, 0x58e9c5ce), TOBN(0x9d9cce82, 0xded092a5)}, + {TOBN(0xcc6e5979, 0x6e93b7c7), TOBN(0xe1e47092, 0x31bb9e27), + TOBN(0xb70b3083, 0xaa9e29a0), TOBN(0xbf181a75, 0x3785e644)}}, + {{TOBN(0xf53f2c65, 0x8ead09f7), TOBN(0x1335e1d5, 0x9780d14d), + TOBN(0x69cc20e0, 0xcd1b66bc), TOBN(0x9b670a37, 0xbbe0bfc8)}, + {TOBN(0xce53dc81, 0x28efbeed), TOBN(0x0c74e77c, 0x8326a6e5), + TOBN(0x3604e0d2, 0xb88e9a63), TOBN(0xbab38fca, 0x13dc2248)}}, + {{TOBN(0x8ed6e8c8, 0x5c0a3f1e), TOBN(0xbcad2492, 0x7c87c37f), + TOBN(0xfdfb62bb, 0x9ee3b78d), TOBN(0xeba8e477, 0xcbceba46)}, + {TOBN(0x37d38cb0, 0xeeaede4b), TOBN(0x0bc498e8, 0x7976deb6), + TOBN(0xb2944c04, 0x6b6147fb), TOBN(0x8b123f35, 
0xf71f9609)}}, + {{TOBN(0xa155dcc7, 0xde79dc24), TOBN(0xf1168a32, 0x558f69cd), + TOBN(0xbac21595, 0x0d1850df), TOBN(0x15c8295b, 0xb204c848)}, + {TOBN(0xf661aa36, 0x7d8184ff), TOBN(0xc396228e, 0x30447bdb), + TOBN(0x11cd5143, 0xbde4a59e), TOBN(0xe3a26e3b, 0x6beab5e6)}}, + {{TOBN(0xd3b3a13f, 0x1402b9d0), TOBN(0x573441c3, 0x2c7bc863), + TOBN(0x4b301ec4, 0x578c3e6e), TOBN(0xc26fc9c4, 0x0adaf57e)}, + {TOBN(0x96e71bfd, 0x7493cea3), TOBN(0xd05d4b3f, 0x1af81456), + TOBN(0xdaca2a8a, 0x6a8c608f), TOBN(0x53ef07f6, 0x0725b276)}}, + {{TOBN(0x07a5fbd2, 0x7824fc56), TOBN(0x34675218, 0x13289077), + TOBN(0x5bf69fd5, 0xe0c48349), TOBN(0xa613ddd3, 0xb6aa7875)}, + {TOBN(0x7f78c19c, 0x5450d866), TOBN(0x46f4409c, 0x8f84a481), + TOBN(0x9f1d1928, 0x90fce239), TOBN(0x016c4168, 0xb2ce44b9)}}, + {{TOBN(0xbae023f0, 0xc7435978), TOBN(0xb152c888, 0x20e30e19), + TOBN(0x9c241645, 0xe3fa6faf), TOBN(0x735d95c1, 0x84823e60)}, + {TOBN(0x03197573, 0x03955317), TOBN(0x0b4b02a9, 0xf03b4995), + TOBN(0x076bf559, 0x70274600), TOBN(0x32c5cc53, 0xaaf57508)}}, + {{TOBN(0xe8af6d1f, 0x60624129), TOBN(0xb7bc5d64, 0x9a5e2b5e), + TOBN(0x3814b048, 0x5f082d72), TOBN(0x76f267f2, 0xce19677a)}, + {TOBN(0x626c630f, 0xb36eed93), TOBN(0x55230cd7, 0x3bf56803), + TOBN(0x78837949, 0xce2736a0), TOBN(0x0d792d60, 0xaa6c55f1)}}, + {{TOBN(0x0318dbfd, 0xd5c7c5d2), TOBN(0xb38f8da7, 0x072b342d), + TOBN(0x3569bddc, 0x7b8de38a), TOBN(0xf25b5887, 0xa1c94842)}, + {TOBN(0xb2d5b284, 0x2946ad60), TOBN(0x854f29ad, 0xe9d1707e), + TOBN(0xaa5159dc, 0x2c6a4509), TOBN(0x899f94c0, 0x57189837)}}, + {{TOBN(0xcf6adc51, 0xf4a55b03), TOBN(0x261762de, 0x35e3b2d5), + TOBN(0x4cc43012, 0x04827b51), TOBN(0xcd22a113, 0xc6021442)}, + {TOBN(0xce2fd61a, 0x247c9569), TOBN(0x59a50973, 0xd152beca), + TOBN(0x6c835a11, 0x63a716d4), TOBN(0xc26455ed, 0x187dedcf)}}, + {{TOBN(0x27f536e0, 0x49ce89e7), TOBN(0x18908539, 0xcc890cb5), + TOBN(0x308909ab, 0xd83c2aa1), TOBN(0xecd3142b, 0x1ab73bd3)}, + {TOBN(0x6a85bf59, 0xb3f5ab84), TOBN(0x3c320a68, 0xf2bea4c6), + TOBN(0xad8dc538, 0x6da4541f), TOBN(0xeaf34eb0, 0xb7c41186)}}, + {{TOBN(0x1c780129, 0x977c97c4), TOBN(0x5ff9beeb, 0xc57eb9fa), + TOBN(0xa24d0524, 0xc822c478), TOBN(0xfd8eec2a, 0x461cd415)}, + {TOBN(0xfbde194e, 0xf027458c), TOBN(0xb4ff5319, 0x1d1be115), + TOBN(0x63f874d9, 0x4866d6f4), TOBN(0x35c75015, 0xb21ad0c9)}}, + {{TOBN(0xa6b5c9d6, 0x46ac49d2), TOBN(0x42c77c0b, 0x83137aa9), + TOBN(0x24d000fc, 0x68225a38), TOBN(0x0f63cfc8, 0x2fe1e907)}, + {TOBN(0x22d1b01b, 0xc6441f95), TOBN(0x7d38f719, 0xec8e448f), + TOBN(0x9b33fa5f, 0x787fb1ba), TOBN(0x94dcfda1, 0x190158df)}}, + {{TOBN(0xc47cb339, 0x5f6d4a09), TOBN(0x6b4f355c, 0xee52b826), + TOBN(0x3d100f5d, 0xf51b930a), TOBN(0xf4512fac, 0x9f668f69)}, + {TOBN(0x546781d5, 0x206c4c74), TOBN(0xd021d4d4, 0xcb4d2e48), + TOBN(0x494a54c2, 0xca085c2d), TOBN(0xf1dbaca4, 0x520850a8)}}, + {{TOBN(0x63c79326, 0x490a1aca), TOBN(0xcb64dd9c, 0x41526b02), + TOBN(0xbb772591, 0xa2979258), TOBN(0x3f582970, 0x48d97846)}, + {TOBN(0xd66b70d1, 0x7c213ba7), TOBN(0xc28febb5, 0xe8a0ced4), + TOBN(0x6b911831, 0xc10338c1), TOBN(0x0d54e389, 0xbf0126f3)}}, + {{TOBN(0x7048d460, 0x4af206ee), TOBN(0x786c88f6, 0x77e97cb9), + TOBN(0xd4375ae1, 0xac64802e), TOBN(0x469bcfe1, 0xd53ec11c)}, + {TOBN(0xfc9b340d, 0x47062230), TOBN(0xe743bb57, 0xc5b4a3ac), + TOBN(0xfe00b4aa, 0x59ef45ac), TOBN(0x29a4ef23, 0x59edf188)}}, + {{TOBN(0x40242efe, 0xb483689b), TOBN(0x2575d3f6, 0x513ac262), + TOBN(0xf30037c8, 0x0ca6db72), TOBN(0xc9fcce82, 0x98864be2)}, + {TOBN(0x84a112ff, 0x0149362d), TOBN(0x95e57582, 0x1c4ae971), + TOBN(0x1fa4b1a8, 0x945cf86c), 
TOBN(0x4525a734, 0x0b024a2f)}}, + {{TOBN(0xe76c8b62, 0x8f338360), TOBN(0x483ff593, 0x28edf32b), + TOBN(0x67e8e90a, 0x298b1aec), TOBN(0x9caab338, 0x736d9a21)}, + {TOBN(0x5c09d2fd, 0x66892709), TOBN(0x2496b4dc, 0xb55a1d41), + TOBN(0x93f5fb1a, 0xe24a4394), TOBN(0x08c75049, 0x6fa8f6c1)}}, + {{TOBN(0xcaead1c2, 0xc905d85f), TOBN(0xe9d7f790, 0x0733ae57), + TOBN(0x24c9a65c, 0xf07cdd94), TOBN(0x7389359c, 0xa4b55931)}, + {TOBN(0xf58709b7, 0x367e45f7), TOBN(0x1f203067, 0xcb7e7adc), + TOBN(0x82444bff, 0xc7b72818), TOBN(0x07303b35, 0xbaac8033)}}, + {{TOBN(0x1e1ee4e4, 0xd13b7ea1), TOBN(0xe6489b24, 0xe0e74180), + TOBN(0xa5f2c610, 0x7e70ef70), TOBN(0xa1655412, 0xbdd10894)}, + {TOBN(0x555ebefb, 0x7af4194e), TOBN(0x533c1c3c, 0x8e89bd9c), + TOBN(0x735b9b57, 0x89895856), TOBN(0x15fb3cd2, 0x567f5c15)}}, + {{TOBN(0x057fed45, 0x526f09fd), TOBN(0xe8a4f10c, 0x8128240a), + TOBN(0x9332efc4, 0xff2bfd8d), TOBN(0x214e77a0, 0xbd35aa31)}, + {TOBN(0x32896d73, 0x14faa40e), TOBN(0x767867ec, 0x01e5f186), + TOBN(0xc9adf8f1, 0x17a1813e), TOBN(0xcb6cda78, 0x54741795)}}, + {{TOBN(0xb7521b6d, 0x349d51aa), TOBN(0xf56b5a9e, 0xe3c7b8e9), + TOBN(0xc6f1e5c9, 0x32a096df), TOBN(0x083667c4, 0xa3635024)}, + {TOBN(0x365ea135, 0x18087f2f), TOBN(0xf1b8eaac, 0xd136e45d), + TOBN(0xc8a0e484, 0x73aec989), TOBN(0xd75a324b, 0x142c9259)}}, + {{TOBN(0xb7b4d001, 0x01dae185), TOBN(0x45434e0b, 0x9b7a94bc), + TOBN(0xf54339af, 0xfbd8cb0b), TOBN(0xdcc4569e, 0xe98ef49e)}, + {TOBN(0x7789318a, 0x09a51299), TOBN(0x81b4d206, 0xb2b025d8), + TOBN(0xf64aa418, 0xfae85792), TOBN(0x3e50258f, 0xacd7baf7)}}, + {{TOBN(0xdce84cdb, 0x2996864b), TOBN(0xa2e67089, 0x1f485fa4), + TOBN(0xb28b2bb6, 0x534c6a5a), TOBN(0x31a7ec6b, 0xc94b9d39)}, + {TOBN(0x1d217766, 0xd6bc20da), TOBN(0x4acdb5ec, 0x86761190), + TOBN(0x68726328, 0x73701063), TOBN(0x4d24ee7c, 0x2128c29b)}}, + {{TOBN(0xc072ebd3, 0xa19fd868), TOBN(0x612e481c, 0xdb8ddd3b), + TOBN(0xb4e1d754, 0x1a64d852), TOBN(0x00ef95ac, 0xc4c6c4ab)}, + {TOBN(0x1536d2ed, 0xaa0a6c46), TOBN(0x61294086, 0x43774790), + TOBN(0x54af25e8, 0x343fda10), TOBN(0x9ff9d98d, 0xfd25d6f2)}}, + {{TOBN(0x0746af7c, 0x468b8835), TOBN(0x977a31cb, 0x730ecea7), + TOBN(0xa5096b80, 0xc2cf4a81), TOBN(0xaa986833, 0x6458c37a)}, + {TOBN(0x6af29bf3, 0xa6bd9d34), TOBN(0x6a62fe9b, 0x33c5d854), + TOBN(0x50e6c304, 0xb7133b5e), TOBN(0x04b60159, 0x7d6e6848)}}, + {{TOBN(0x4cd296df, 0x5579bea4), TOBN(0x10e35ac8, 0x5ceedaf1), + TOBN(0x04c4c5fd, 0xe3bcc5b1), TOBN(0x95f9ee8a, 0x89412cf9)}, + {TOBN(0x2c9459ee, 0x82b6eb0f), TOBN(0x2e845765, 0x95c2aadd), + TOBN(0x774a84ae, 0xd327fcfe), TOBN(0xd8c93722, 0x0368d476)}}, + {{TOBN(0x0dbd5748, 0xf83e8a3b), TOBN(0xa579aa96, 0x8d2495f3), + TOBN(0x535996a0, 0xae496e9b), TOBN(0x07afbfe9, 0xb7f9bcc2)}, + {TOBN(0x3ac1dc6d, 0x5b7bd293), TOBN(0x3b592cff, 0x7022323d), + TOBN(0xba0deb98, 0x9c0a3e76), TOBN(0x18e78e9f, 0x4b197acb)}}, + {{TOBN(0x211cde10, 0x296c36ef), TOBN(0x7ee89672, 0x82c4da77), + TOBN(0xb617d270, 0xa57836da), TOBN(0xf0cd9c31, 0x9cb7560b)}, + {TOBN(0x01fdcbf7, 0xe455fe90), TOBN(0x3fb53cbb, 0x7e7334f3), + TOBN(0x781e2ea4, 0x4e7de4ec), TOBN(0x8adab3ad, 0x0b384fd0)}}, + {{TOBN(0x129eee2f, 0x53d64829), TOBN(0x7a471e17, 0xa261492b), + TOBN(0xe4f9adb9, 0xe4cb4a2c), TOBN(0x3d359f6f, 0x97ba2c2d)}, + {TOBN(0x346c6786, 0x0aacd697), TOBN(0x92b444c3, 0x75c2f8a8), + TOBN(0xc79fa117, 0xd85df44e), TOBN(0x56782372, 0x398ddf31)}}, + {{TOBN(0x60e690f2, 0xbbbab3b8), TOBN(0x4851f8ae, 0x8b04816b), + TOBN(0xc72046ab, 0x9c92e4d2), TOBN(0x518c74a1, 0x7cf3136b)}, + {TOBN(0xff4eb50a, 0xf9877d4c), TOBN(0x14578d90, 0xa919cabb), + TOBN(0x8218f8c4, 
0xac5eb2b6), TOBN(0xa3ccc547, 0x542016e4)}}, + {{TOBN(0x025bf48e, 0x327f8349), TOBN(0xf3e97346, 0xf43cb641), + TOBN(0xdc2bafdf, 0x500f1085), TOBN(0x57167876, 0x2f063055)}, + {TOBN(0x5bd914b9, 0x411925a6), TOBN(0x7c078d48, 0xa1123de5), + TOBN(0xee6bf835, 0x182b165d), TOBN(0xb11b5e5b, 0xba519727)}}, + {{TOBN(0xe33ea76c, 0x1eea7b85), TOBN(0x2352b461, 0x92d4f85e), + TOBN(0xf101d334, 0xafe115bb), TOBN(0xfabc1294, 0x889175a3)}, + {TOBN(0x7f6bcdc0, 0x5233f925), TOBN(0xe0a802db, 0xe77fec55), + TOBN(0xbdb47b75, 0x8069b659), TOBN(0x1c5e12de, 0xf98fbd74)}}, + {{TOBN(0x869c58c6, 0x4b8457ee), TOBN(0xa5360f69, 0x4f7ea9f7), + TOBN(0xe576c09f, 0xf460b38f), TOBN(0x6b70d548, 0x22b7fb36)}, + {TOBN(0x3fd237f1, 0x3bfae315), TOBN(0x33797852, 0xcbdff369), + TOBN(0x97df25f5, 0x25b516f9), TOBN(0x46f388f2, 0xba38ad2d)}}, + {{TOBN(0x656c4658, 0x89d8ddbb), TOBN(0x8830b26e, 0x70f38ee8), + TOBN(0x4320fd5c, 0xde1212b0), TOBN(0xc34f30cf, 0xe4a2edb2)}, + {TOBN(0xabb131a3, 0x56ab64b8), TOBN(0x7f77f0cc, 0xd99c5d26), + TOBN(0x66856a37, 0xbf981d94), TOBN(0x19e76d09, 0x738bd76e)}}, + {{TOBN(0xe76c8ac3, 0x96238f39), TOBN(0xc0a482be, 0xa830b366), + TOBN(0xb7b8eaff, 0x0b4eb499), TOBN(0x8ecd83bc, 0x4bfb4865)}, + {TOBN(0x971b2cb7, 0xa2f3776f), TOBN(0xb42176a4, 0xf4b88adf), + TOBN(0xb9617df5, 0xbe1fa446), TOBN(0x8b32d508, 0xcd031bd2)}}, + {{TOBN(0x1c6bd47d, 0x53b618c0), TOBN(0xc424f46c, 0x6a227923), + TOBN(0x7303ffde, 0xdd92d964), TOBN(0xe9712878, 0x71b5abf2)}, + {TOBN(0x8f48a632, 0xf815561d), TOBN(0x85f48ff5, 0xd3c055d1), + TOBN(0x222a1427, 0x7525684f), TOBN(0xd0d841a0, 0x67360cc3)}}, + {{TOBN(0x4245a926, 0x0b9267c6), TOBN(0xc78913f1, 0xcf07f863), + TOBN(0xaa844c8e, 0x4d0d9e24), TOBN(0xa42ad522, 0x3d5f9017)}, + {TOBN(0xbd371749, 0xa2c989d5), TOBN(0x928292df, 0xe1f5e78e), + TOBN(0x493b383e, 0x0a1ea6da), TOBN(0x5136fd8d, 0x13aee529)}}, + {{TOBN(0x860c44b1, 0xf2c34a99), TOBN(0x3b00aca4, 0xbf5855ac), + TOBN(0xabf6aaa0, 0xfaaf37be), TOBN(0x65f43682, 0x2a53ec08)}, + {TOBN(0x1d9a5801, 0xa11b12e1), TOBN(0x78a7ab2c, 0xe20ed475), + TOBN(0x0de1067e, 0x9a41e0d5), TOBN(0x30473f5f, 0x305023ea)}}, + {{TOBN(0xdd3ae09d, 0x169c7d97), TOBN(0x5cd5baa4, 0xcfaef9cd), + TOBN(0x5cd7440b, 0x65a44803), TOBN(0xdc13966a, 0x47f364de)}, + {TOBN(0x077b2be8, 0x2b8357c1), TOBN(0x0cb1b4c5, 0xe9d57c2a), + TOBN(0x7a4ceb32, 0x05ff363e), TOBN(0xf310fa4d, 0xca35a9ef)}}, + {{TOBN(0xdbb7b352, 0xf97f68c6), TOBN(0x0c773b50, 0x0b02cf58), + TOBN(0xea2e4821, 0x3c1f96d9), TOBN(0xffb357b0, 0xeee01815)}, + {TOBN(0xb9c924cd, 0xe0f28039), TOBN(0x0b36c95a, 0x46a3fbe4), + TOBN(0x1faaaea4, 0x5e46db6c), TOBN(0xcae575c3, 0x1928aaff)}}, + {{TOBN(0x7f671302, 0xa70dab86), TOBN(0xfcbd12a9, 0x71c58cfc), + TOBN(0xcbef9acf, 0xbee0cb92), TOBN(0x573da0b9, 0xf8c1b583)}, + {TOBN(0x4752fcfe, 0x0d41d550), TOBN(0xe7eec0e3, 0x2155cffe), + TOBN(0x0fc39fcb, 0x545ae248), TOBN(0x522cb8d1, 0x8065f44e)}}, + {{TOBN(0x263c962a, 0x70cbb96c), TOBN(0xe034362a, 0xbcd124a9), + TOBN(0xf120db28, 0x3c2ae58d), TOBN(0xb9a38d49, 0xfef6d507)}, + {TOBN(0xb1fd2a82, 0x1ff140fd), TOBN(0xbd162f30, 0x20aee7e0), + TOBN(0x4e17a5d4, 0xcb251949), TOBN(0x2aebcb83, 0x4f7e1c3d)}}, + {{TOBN(0x608eb25f, 0x937b0527), TOBN(0xf42e1e47, 0xeb7d9997), + TOBN(0xeba699c4, 0xb8a53a29), TOBN(0x1f921c71, 0xe091b536)}, + {TOBN(0xcce29e7b, 0x5b26bbd5), TOBN(0x7a8ef5ed, 0x3b61a680), + TOBN(0xe5ef8043, 0xba1f1c7e), TOBN(0x16ea8217, 0x18158dda)}}, + {{TOBN(0x01778a2b, 0x599ff0f9), TOBN(0x68a923d7, 0x8104fc6b), + TOBN(0x5bfa44df, 0xda694ff3), TOBN(0x4f7199db, 0xf7667f12)}, + {TOBN(0xc06d8ff6, 0xe46f2a79), TOBN(0x08b5dead, 0xe9f8131d), + 
TOBN(0x02519a59, 0xabb4ce7c), TOBN(0xc4f710bc, 0xb42aec3e)}}, + {{TOBN(0x3d77b057, 0x78bde41a), TOBN(0x6474bf80, 0xb4186b5a), + TOBN(0x048b3f67, 0x88c65741), TOBN(0xc64519de, 0x03c7c154)}, + {TOBN(0xdf073846, 0x0edfcc4f), TOBN(0x319aa737, 0x48f1aa6b), + TOBN(0x8b9f8a02, 0xca909f77), TOBN(0x90258139, 0x7580bfef)}}, + {{TOBN(0xd8bfd3ca, 0xc0c22719), TOBN(0xc60209e4, 0xc9ca151e), + TOBN(0x7a744ab5, 0xd9a1a69c), TOBN(0x6de5048b, 0x14937f8f)}, + {TOBN(0x171938d8, 0xe115ac04), TOBN(0x7df70940, 0x1c6b16d2), + TOBN(0xa6aeb663, 0x7f8e94e7), TOBN(0xc130388e, 0x2a2cf094)}}, + {{TOBN(0x1850be84, 0x77f54e6e), TOBN(0x9f258a72, 0x65d60fe5), + TOBN(0xff7ff0c0, 0x6c9146d6), TOBN(0x039aaf90, 0xe63a830b)}, + {TOBN(0x38f27a73, 0x9460342f), TOBN(0x4703148c, 0x3f795f8a), + TOBN(0x1bb5467b, 0x9681a97e), TOBN(0x00931ba5, 0xecaeb594)}}, + {{TOBN(0xcdb6719d, 0x786f337c), TOBN(0xd9c01cd2, 0xe704397d), + TOBN(0x0f4a3f20, 0x555c2fef), TOBN(0x00452509, 0x7c0af223)}, + {TOBN(0x54a58047, 0x84db8e76), TOBN(0x3bacf1aa, 0x93c8aa06), + TOBN(0x11ca957c, 0xf7919422), TOBN(0x50641053, 0x78cdaa40)}}, + {{TOBN(0x7a303874, 0x9f7144ae), TOBN(0x170c963f, 0x43d4acfd), + TOBN(0x5e148149, 0x58ddd3ef), TOBN(0xa7bde582, 0x9e72dba8)}, + {TOBN(0x0769da8b, 0x6fa68750), TOBN(0xfa64e532, 0x572e0249), + TOBN(0xfcaadf9d, 0x2619ad31), TOBN(0x87882daa, 0xa7b349cd)}}, + {{TOBN(0x9f6eb731, 0x6c67a775), TOBN(0xcb10471a, 0xefc5d0b1), + TOBN(0xb433750c, 0xe1b806b2), TOBN(0x19c5714d, 0x57b1ae7e)}, + {TOBN(0xc0dc8b7b, 0xed03fd3f), TOBN(0xdd03344f, 0x31bc194e), + TOBN(0xa66c52a7, 0x8c6320b5), TOBN(0x8bc82ce3, 0xd0b6fd93)}}, + {{TOBN(0xf8e13501, 0xb35f1341), TOBN(0xe53156dd, 0x25a43e42), + TOBN(0xd3adf27e, 0x4daeb85c), TOBN(0xb81d8379, 0xbbeddeb5)}, + {TOBN(0x1b0b546e, 0x2e435867), TOBN(0x9020eb94, 0xeba5dd60), + TOBN(0x37d91161, 0x8210cb9d), TOBN(0x4c596b31, 0x5c91f1cf)}}, + {{TOBN(0xb228a90f, 0x0e0b040d), TOBN(0xbaf02d82, 0x45ff897f), + TOBN(0x2aac79e6, 0x00fa6122), TOBN(0x24828817, 0x8e36f557)}, + {TOBN(0xb9521d31, 0x113ec356), TOBN(0x9e48861e, 0x15eff1f8), + TOBN(0x2aa1d412, 0xe0d41715), TOBN(0x71f86203, 0x53f131b8)}}, + {{TOBN(0xf60da8da, 0x3fd19408), TOBN(0x4aa716dc, 0x278d9d99), + TOBN(0x394531f7, 0xa8c51c90), TOBN(0xb560b0e8, 0xf59db51c)}, + {TOBN(0xa28fc992, 0xfa34bdad), TOBN(0xf024fa14, 0x9cd4f8bd), + TOBN(0x5cf530f7, 0x23a9d0d3), TOBN(0x615ca193, 0xe28c9b56)}}, + {{TOBN(0x6d2a483d, 0x6f73c51e), TOBN(0xa4cb2412, 0xea0dc2dd), + TOBN(0x50663c41, 0x1eb917ff), TOBN(0x3d3a74cf, 0xeade299e)}, + {TOBN(0x29b3990f, 0x4a7a9202), TOBN(0xa9bccf59, 0xa7b15c3d), + TOBN(0x66a3ccdc, 0xa5df9208), TOBN(0x48027c14, 0x43f2f929)}}, + {{TOBN(0xd385377c, 0x40b557f0), TOBN(0xe001c366, 0xcd684660), + TOBN(0x1b18ed6b, 0xe2183a27), TOBN(0x879738d8, 0x63210329)}, + {TOBN(0xa687c74b, 0xbda94882), TOBN(0xd1bbcc48, 0xa684b299), + TOBN(0xaf6f1112, 0x863b3724), TOBN(0x6943d1b4, 0x2c8ce9f8)}}, + {{TOBN(0xe044a3bb, 0x098cafb4), TOBN(0x27ed2310, 0x60d48caf), + TOBN(0x542b5675, 0x3a31b84d), TOBN(0xcbf3dd50, 0xfcddbed7)}, + {TOBN(0x25031f16, 0x41b1d830), TOBN(0xa7ec851d, 0xcb0c1e27), + TOBN(0xac1c8fe0, 0xb5ae75db), TOBN(0xb24c7557, 0x08c52120)}}, + {{TOBN(0x57f811dc, 0x1d4636c3), TOBN(0xf8436526, 0x681a9939), + TOBN(0x1f6bc6d9, 0x9c81adb3), TOBN(0x840f8ac3, 0x5b7d80d4)}, + {TOBN(0x731a9811, 0xf4387f1a), TOBN(0x7c501cd3, 0xb5156880), + TOBN(0xa5ca4a07, 0xdfe68867), TOBN(0xf123d8f0, 0x5fcea120)}}, + {{TOBN(0x1fbb0e71, 0xd607039e), TOBN(0x2b70e215, 0xcd3a4546), + TOBN(0x32d2f01d, 0x53324091), TOBN(0xb796ff08, 0x180ab19b)}, + {TOBN(0x32d87a86, 0x3c57c4aa), TOBN(0x2aed9caf, 0xb7c49a27), 
+ TOBN(0x9fb35eac, 0x31630d98), TOBN(0x338e8cdf, 0x5c3e20a3)}}, + {{TOBN(0x80f16182, 0x66cde8db), TOBN(0x4e159980, 0x2d72fd36), + TOBN(0xd7b8f13b, 0x9b6e5072), TOBN(0xf5213907, 0x3b7b5dc1)}, + {TOBN(0x4d431f1d, 0x8ce4396e), TOBN(0x37a1a680, 0xa7ed2142), + TOBN(0xbf375696, 0xd01aaf6b), TOBN(0xaa1c0c54, 0xe63aab66)}}, + {{TOBN(0x3014368b, 0x4ed80940), TOBN(0x67e6d056, 0x7a6fcedd), + TOBN(0x7c208c49, 0xca97579f), TOBN(0xfe3d7a81, 0xa23597f6)}, + {TOBN(0x5e203202, 0x7e096ae2), TOBN(0xb1f3e1e7, 0x24b39366), + TOBN(0x26da26f3, 0x2fdcdffc), TOBN(0x79422f1d, 0x6097be83)}}}, + {{{TOBN(0x263a2cfb, 0x9db3b381), TOBN(0x9c3a2dee, 0xd4df0a4b), + TOBN(0x728d06e9, 0x7d04e61f), TOBN(0x8b1adfbc, 0x42449325)}, + {TOBN(0x6ec1d939, 0x7e053a1b), TOBN(0xee2be5c7, 0x66daf707), + TOBN(0x80ba1e14, 0x810ac7ab), TOBN(0xdd2ae778, 0xf530f174)}}, + {{TOBN(0x0435d97a, 0x205b9d8b), TOBN(0x6eb8f064, 0x056756d4), + TOBN(0xd5e88a8b, 0xb6f8210e), TOBN(0x070ef12d, 0xec9fd9ea)}, + {TOBN(0x4d849505, 0x3bcc876a), TOBN(0x12a75338, 0xa7404ce3), + TOBN(0xd22b49e1, 0xb8a1db5e), TOBN(0xec1f2051, 0x14bfa5ad)}}, + {{TOBN(0xadbaeb79, 0xb6828f36), TOBN(0x9d7a0258, 0x01bd5b9e), + TOBN(0xeda01e0d, 0x1e844b0c), TOBN(0x4b625175, 0x887edfc9)}, + {TOBN(0x14109fdd, 0x9669b621), TOBN(0x88a2ca56, 0xf6f87b98), + TOBN(0xfe2eb788, 0x170df6bc), TOBN(0x0cea06f4, 0xffa473f9)}}, + {{TOBN(0x43ed81b5, 0xc4e83d33), TOBN(0xd9f35879, 0x5efd488b), + TOBN(0x164a620f, 0x9deb4d0f), TOBN(0xc6927bdb, 0xac6a7394)}, + {TOBN(0x45c28df7, 0x9f9e0f03), TOBN(0x2868661e, 0xfcd7e1a9), + TOBN(0x7cf4e8d0, 0xffa348f1), TOBN(0x6bd4c284, 0x398538e0)}}, + {{TOBN(0x2618a091, 0x289a8619), TOBN(0xef796e60, 0x6671b173), + TOBN(0x664e46e5, 0x9090c632), TOBN(0xa38062d4, 0x1e66f8fb)}, + {TOBN(0x6c744a20, 0x0573274e), TOBN(0xd07b67e4, 0xa9271394), + TOBN(0x391223b2, 0x6bdc0e20), TOBN(0xbe2d93f1, 0xeb0a05a7)}}, + {{TOBN(0xf23e2e53, 0x3f36d141), TOBN(0xe84bb3d4, 0x4dfca442), + TOBN(0xb804a48d, 0x6b7c023a), TOBN(0x1e16a8fa, 0x76431c3b)}, + {TOBN(0x1b5452ad, 0xddd472e0), TOBN(0x7d405ee7, 0x0d1ee127), + TOBN(0x50fc6f1d, 0xffa27599), TOBN(0x351ac53c, 0xbf391b35)}}, + {{TOBN(0x7efa14b8, 0x4444896b), TOBN(0x64974d2f, 0xf94027fb), + TOBN(0xefdcd0e8, 0xde84487d), TOBN(0x8c45b260, 0x2b48989b)}, + {TOBN(0xa8fcbbc2, 0xd8463487), TOBN(0xd1b2b3f7, 0x3fbc476c), + TOBN(0x21d005b7, 0xc8f443c0), TOBN(0x518f2e67, 0x40c0139c)}}, + {{TOBN(0x56036e8c, 0x06d75fc1), TOBN(0x2dcf7bb7, 0x3249a89f), + TOBN(0x81dd1d3d, 0xe245e7dd), TOBN(0xf578dc4b, 0xebd6e2a7)}, + {TOBN(0x4c028903, 0xdf2ce7a0), TOBN(0xaee36288, 0x9c39afac), + TOBN(0xdc847c31, 0x146404ab), TOBN(0x6304c0d8, 0xa4e97818)}}, + {{TOBN(0xae51dca2, 0xa91f6791), TOBN(0x2abe4190, 0x9baa9efc), + TOBN(0xd9d2e2f4, 0x559c7ac1), TOBN(0xe82f4b51, 0xfc9f773a)}, + {TOBN(0xa7713027, 0x4073e81c), TOBN(0xc0276fac, 0xfbb596fc), + TOBN(0x1d819fc9, 0xa684f70c), TOBN(0x29b47fdd, 0xc9f7b1e0)}}, + {{TOBN(0x358de103, 0x459b1940), TOBN(0xec881c59, 0x5b013e93), + TOBN(0x51574c93, 0x49532ad3), TOBN(0x2db1d445, 0xb37b46de)}, + {TOBN(0xc6445b87, 0xdf239fd8), TOBN(0xc718af75, 0x151d24ee), + TOBN(0xaea1c4a4, 0xf43c6259), TOBN(0x40c0e5d7, 0x70be02f7)}}, + {{TOBN(0x6a4590f4, 0x721b33f2), TOBN(0x2124f1fb, 0xfedf04ea), + TOBN(0xf8e53cde, 0x9745efe7), TOBN(0xe7e10432, 0x65f046d9)}, + {TOBN(0xc3fca28e, 0xe4d0c7e6), TOBN(0x847e339a, 0x87253b1b), + TOBN(0x9b595348, 0x3743e643), TOBN(0xcb6a0a0b, 0x4fd12fc5)}}, + {{TOBN(0xfb6836c3, 0x27d02dcc), TOBN(0x5ad00982, 0x7a68bcc2), + TOBN(0x1b24b44c, 0x005e912d), TOBN(0xcc83d20f, 0x811fdcfe)}, + {TOBN(0x36527ec1, 0x666fba0c), TOBN(0x69948197, 
0x14754635), + TOBN(0xfcdcb1a8, 0x556da9c2), TOBN(0xa5934267, 0x81a732b2)}}, + {{TOBN(0xec1214ed, 0xa714181d), TOBN(0x609ac13b, 0x6067b341), + TOBN(0xff4b4c97, 0xa545df1f), TOBN(0xa1240501, 0x34d2076b)}, + {TOBN(0x6efa0c23, 0x1409ca97), TOBN(0x254cc1a8, 0x20638c43), + TOBN(0xd4e363af, 0xdcfb46cd), TOBN(0x62c2adc3, 0x03942a27)}}, + {{TOBN(0xc67b9df0, 0x56e46483), TOBN(0xa55abb20, 0x63736356), + TOBN(0xab93c098, 0xc551bc52), TOBN(0x382b49f9, 0xb15fe64b)}, + {TOBN(0x9ec221ad, 0x4dff8d47), TOBN(0x79caf615, 0x437df4d6), + TOBN(0x5f13dc64, 0xbb456509), TOBN(0xe4c589d9, 0x191f0714)}}, + {{TOBN(0x27b6a8ab, 0x3fd40e09), TOBN(0xe455842e, 0x77313ea9), + TOBN(0x8b51d1e2, 0x1f55988b), TOBN(0x5716dd73, 0x062bbbfc)}, + {TOBN(0x633c11e5, 0x4e8bf3de), TOBN(0x9a0e77b6, 0x1b85be3b), + TOBN(0x56510729, 0x0911cca6), TOBN(0x27e76495, 0xefa6590f)}}, + {{TOBN(0xe4ac8b33, 0x070d3aab), TOBN(0x2643672b, 0x9a2cd5e5), + TOBN(0x52eff79b, 0x1cfc9173), TOBN(0x665ca49b, 0x90a7c13f)}, + {TOBN(0x5a8dda59, 0xb3efb998), TOBN(0x8a5b922d, 0x052f1341), + TOBN(0xae9ebbab, 0x3cf9a530), TOBN(0x35986e7b, 0xf56da4d7)}}, + {{TOBN(0x3a636b5c, 0xff3513cc), TOBN(0xbb0cf8ba, 0x3198f7dd), + TOBN(0xb8d40522, 0x41f16f86), TOBN(0x760575d8, 0xde13a7bf)}, + {TOBN(0x36f74e16, 0x9f7aa181), TOBN(0x163a3ecf, 0xf509ed1c), + TOBN(0x6aead61f, 0x3c40a491), TOBN(0x158c95fc, 0xdfe8fcaa)}}, + {{TOBN(0xa3991b6e, 0x13cda46f), TOBN(0x79482415, 0x342faed0), + TOBN(0xf3ba5bde, 0x666b5970), TOBN(0x1d52e6bc, 0xb26ab6dd)}, + {TOBN(0x768ba1e7, 0x8608dd3d), TOBN(0x4930db2a, 0xea076586), + TOBN(0xd9575714, 0xe7dc1afa), TOBN(0x1fc7bf7d, 0xf7c58817)}}, + {{TOBN(0x6b47accd, 0xd9eee96c), TOBN(0x0ca277fb, 0xe58cec37), + TOBN(0x113fe413, 0xe702c42a), TOBN(0xdd1764ee, 0xc47cbe51)}, + {TOBN(0x041e7cde, 0x7b3ed739), TOBN(0x50cb7459, 0x5ce9e1c0), + TOBN(0x35568513, 0x2925b212), TOBN(0x7cff95c4, 0x001b081c)}}, + {{TOBN(0x63ee4cbd, 0x8088b454), TOBN(0xdb7f32f7, 0x9a9e0c8a), + TOBN(0xb377d418, 0x6b2447cb), TOBN(0xe3e982aa, 0xd370219b)}, + {TOBN(0x06ccc1e4, 0xc2a2a593), TOBN(0x72c36865, 0x0773f24f), + TOBN(0xa13b4da7, 0x95859423), TOBN(0x8bbf1d33, 0x75040c8f)}}, + {{TOBN(0x726f0973, 0xda50c991), TOBN(0x48afcd5b, 0x822d6ee2), + TOBN(0xe5fc718b, 0x20fd7771), TOBN(0xb9e8e77d, 0xfd0807a1)}, + {TOBN(0x7f5e0f44, 0x99a7703d), TOBN(0x6972930e, 0x618e36f3), + TOBN(0x2b7c77b8, 0x23807bbe), TOBN(0xe5b82405, 0xcb27ff50)}}, + {{TOBN(0xba8b8be3, 0xbd379062), TOBN(0xd64b7a1d, 0x2dce4a92), + TOBN(0x040a73c5, 0xb2952e37), TOBN(0x0a9e252e, 0xd438aeca)}, + {TOBN(0xdd43956b, 0xc39d3bcb), TOBN(0x1a31ca00, 0xb32b2d63), + TOBN(0xd67133b8, 0x5c417a18), TOBN(0xd08e4790, 0x2ef442c8)}}, + {{TOBN(0x98cb1ae9, 0x255c0980), TOBN(0x4bd86381, 0x2b4a739f), + TOBN(0x5a5c31e1, 0x1e4a45a1), TOBN(0x1e5d55fe, 0x9cb0db2f)}, + {TOBN(0x74661b06, 0x8ff5cc29), TOBN(0x026b389f, 0x0eb8a4f4), + TOBN(0x536b21a4, 0x58848c24), TOBN(0x2e5bf8ec, 0x81dc72b0)}}, + {{TOBN(0x03c187d0, 0xad886aac), TOBN(0x5c16878a, 0xb771b645), + TOBN(0xb07dfc6f, 0xc74045ab), TOBN(0x2c6360bf, 0x7800caed)}, + {TOBN(0x24295bb5, 0xb9c972a3), TOBN(0xc9e6f88e, 0x7c9a6dba), + TOBN(0x90ffbf24, 0x92a79aa6), TOBN(0xde29d50a, 0x41c26ac2)}}, + {{TOBN(0x9f0af483, 0xd309cbe6), TOBN(0x5b020d8a, 0xe0bced4f), + TOBN(0x606e986d, 0xb38023e3), TOBN(0xad8f2c9d, 0x1abc6933)}, + {TOBN(0x19292e1d, 0xe7400e93), TOBN(0xfe3e18a9, 0x52be5e4d), + TOBN(0xe8e9771d, 0x2e0680bf), TOBN(0x8c5bec98, 0xc54db063)}}, + {{TOBN(0x2af9662a, 0x74a55d1f), TOBN(0xe3fbf28f, 0x046f66d8), + TOBN(0xa3a72ab4, 0xd4dc4794), TOBN(0x09779f45, 0x5c7c2dd8)}, + {TOBN(0xd893bdaf, 0xc3d19d8d), 
TOBN(0xd5a75094, 0x57d6a6df), + TOBN(0x8cf8fef9, 0x952e6255), TOBN(0x3da67cfb, 0xda9a8aff)}}, + {{TOBN(0x4c23f62a, 0x2c160dcd), TOBN(0x34e6c5e3, 0x8f90eaef), + TOBN(0x35865519, 0xa9a65d5a), TOBN(0x07c48aae, 0x8fd38a3d)}, + {TOBN(0xb7e7aeda, 0x50068527), TOBN(0x2c09ef23, 0x1c90936a), + TOBN(0x31ecfeb6, 0xe879324c), TOBN(0xa0871f6b, 0xfb0ec938)}}, + {{TOBN(0xb1f0fb68, 0xd84d835d), TOBN(0xc90caf39, 0x861dc1e6), + TOBN(0x12e5b046, 0x7594f8d7), TOBN(0x26897ae2, 0x65012b92)}, + {TOBN(0xbcf68a08, 0xa4d6755d), TOBN(0x403ee41c, 0x0991fbda), + TOBN(0x733e343e, 0x3bbf17e8), TOBN(0xd2c7980d, 0x679b3d65)}}, + {{TOBN(0x33056232, 0xd2e11305), TOBN(0x966be492, 0xf3c07a6f), + TOBN(0x6a8878ff, 0xbb15509d), TOBN(0xff221101, 0x0a9b59a4)}, + {TOBN(0x6c9f564a, 0xabe30129), TOBN(0xc6f2c940, 0x336e64cf), + TOBN(0x0fe75262, 0x8b0c8022), TOBN(0xbe0267e9, 0x6ae8db87)}}, + {{TOBN(0x22e192f1, 0x93bc042b), TOBN(0xf085b534, 0xb237c458), + TOBN(0xa0d192bd, 0x832c4168), TOBN(0x7a76e9e3, 0xbdf6271d)}, + {TOBN(0x52a882fa, 0xb88911b5), TOBN(0xc85345e4, 0xb4db0eb5), + TOBN(0xa3be02a6, 0x81a7c3ff), TOBN(0x51889c8c, 0xf0ec0469)}}, + {{TOBN(0x9d031369, 0xa5e829e5), TOBN(0xcbb4c6fc, 0x1607aa41), + TOBN(0x75ac59a6, 0x241d84c1), TOBN(0xc043f2bf, 0x8829e0ee)}, + {TOBN(0x82a38f75, 0x8ea5e185), TOBN(0x8bda40b9, 0xd87cbd9f), + TOBN(0x9e65e75e, 0x2d8fc601), TOBN(0x3d515f74, 0xa35690b3)}}, + {{TOBN(0x534acf4f, 0xda79e5ac), TOBN(0x68b83b3a, 0x8630215f), + TOBN(0x5c748b2e, 0xd085756e), TOBN(0xb0317258, 0xe5d37cb2)}, + {TOBN(0x6735841a, 0xc5ccc2c4), TOBN(0x7d7dc96b, 0x3d9d5069), + TOBN(0xa147e410, 0xfd1754bd), TOBN(0x65296e94, 0xd399ddd5)}}, + {{TOBN(0xf6b5b2d0, 0xbc8fa5bc), TOBN(0x8a5ead67, 0x500c277b), + TOBN(0x214625e6, 0xdfa08a5d), TOBN(0x51fdfedc, 0x959cf047)}, + {TOBN(0x6bc9430b, 0x289fca32), TOBN(0xe36ff0cf, 0x9d9bdc3f), + TOBN(0x2fe187cb, 0x58ea0ede), TOBN(0xed66af20, 0x5a900b3f)}}, + {{TOBN(0x00e0968b, 0x5fa9f4d6), TOBN(0x2d4066ce, 0x37a362e7), + TOBN(0xa99a9748, 0xbd07e772), TOBN(0x710989c0, 0x06a4f1d0)}, + {TOBN(0xd5dedf35, 0xce40cbd8), TOBN(0xab55c5f0, 0x1743293d), + TOBN(0x766f1144, 0x8aa24e2c), TOBN(0x94d874f8, 0x605fbcb4)}}, + {{TOBN(0xa365f0e8, 0xa518001b), TOBN(0xee605eb6, 0x9d04ef0f), + TOBN(0x5a3915cd, 0xba8d4d25), TOBN(0x44c0e1b8, 0xb5113472)}, + {TOBN(0xcbb024e8, 0x8b6740dc), TOBN(0x89087a53, 0xee1d4f0c), + TOBN(0xa88fa05c, 0x1fc4e372), TOBN(0x8bf395cb, 0xaf8b3af2)}}, + {{TOBN(0x1e71c9a1, 0xdeb8568b), TOBN(0xa35daea0, 0x80fb3d32), + TOBN(0xe8b6f266, 0x2cf8fb81), TOBN(0x6d51afe8, 0x9490696a)}, + {TOBN(0x81beac6e, 0x51803a19), TOBN(0xe3d24b7f, 0x86219080), + TOBN(0x727cfd9d, 0xdf6f463c), TOBN(0x8c6865ca, 0x72284ee8)}}, + {{TOBN(0x32c88b7d, 0xb743f4ef), TOBN(0x3793909b, 0xe7d11dce), + TOBN(0xd398f922, 0x2ff2ebe8), TOBN(0x2c70ca44, 0xe5e49796)}, + {TOBN(0xdf4d9929, 0xcb1131b1), TOBN(0x7826f298, 0x25888e79), + TOBN(0x4d3a112c, 0xf1d8740a), TOBN(0x00384cb6, 0x270afa8b)}}, + {{TOBN(0xcb64125b, 0x3ab48095), TOBN(0x3451c256, 0x62d05106), + TOBN(0xd73d577d, 0xa4955845), TOBN(0x39570c16, 0xbf9f4433)}, + {TOBN(0xd7dfaad3, 0xadecf263), TOBN(0xf1c3d8d1, 0xdc76e102), + TOBN(0x5e774a58, 0x54c6a836), TOBN(0xdad4b672, 0x3e92d47b)}}, + {{TOBN(0xbe7e990f, 0xf0d796a0), TOBN(0x5fc62478, 0xdf0e8b02), + TOBN(0x8aae8bf4, 0x030c00ad), TOBN(0x3d2db93b, 0x9004ba0f)}, + {TOBN(0xe48c8a79, 0xd85d5ddc), TOBN(0xe907caa7, 0x6bb07f34), + TOBN(0x58db343a, 0xa39eaed5), TOBN(0x0ea6e007, 0xadaf5724)}}, + {{TOBN(0xe00df169, 0xd23233f3), TOBN(0x3e322796, 0x77cb637f), + TOBN(0x1f897c0e, 0x1da0cf6c), TOBN(0xa651f5d8, 0x31d6bbdd)}, + {TOBN(0xdd61af19, 
0x1a230c76), TOBN(0xbd527272, 0xcdaa5e4a), + TOBN(0xca753636, 0xd0abcd7e), TOBN(0x78bdd37c, 0x370bd8dc)}}, + {{TOBN(0xc23916c2, 0x17cd93fe), TOBN(0x65b97a4d, 0xdadce6e2), + TOBN(0xe04ed4eb, 0x174e42f8), TOBN(0x1491ccaa, 0xbb21480a)}, + {TOBN(0x145a8280, 0x23196332), TOBN(0x3c3862d7, 0x587b479a), + TOBN(0x9f4a88a3, 0x01dcd0ed), TOBN(0x4da2b7ef, 0x3ea12f1f)}}, + {{TOBN(0xf8e7ae33, 0xb126e48e), TOBN(0x404a0b32, 0xf494e237), + TOBN(0x9beac474, 0xc55acadb), TOBN(0x4ee5cf3b, 0xcbec9fd9)}, + {TOBN(0x336b33b9, 0x7df3c8c3), TOBN(0xbd905fe3, 0xb76808fd), + TOBN(0x8f436981, 0xaa45c16a), TOBN(0x255c5bfa, 0x3dd27b62)}}, + {{TOBN(0x71965cbf, 0xc3dd9b4d), TOBN(0xce23edbf, 0xfc068a87), + TOBN(0xb78d4725, 0x745b029b), TOBN(0x74610713, 0xcefdd9bd)}, + {TOBN(0x7116f75f, 0x1266bf52), TOBN(0x02046722, 0x18e49bb6), + TOBN(0xdf43df9f, 0x3d6f19e3), TOBN(0xef1bc7d0, 0xe685cb2f)}}, + {{TOBN(0xcddb27c1, 0x7078c432), TOBN(0xe1961b9c, 0xb77fedb7), + TOBN(0x1edc2f5c, 0xc2290570), TOBN(0x2c3fefca, 0x19cbd886)}, + {TOBN(0xcf880a36, 0xc2af389a), TOBN(0x96c610fd, 0xbda71cea), + TOBN(0xf03977a9, 0x32aa8463), TOBN(0x8eb7763f, 0x8586d90a)}}, + {{TOBN(0x3f342454, 0x2a296e77), TOBN(0xc8718683, 0x42837a35), + TOBN(0x7dc71090, 0x6a09c731), TOBN(0x54778ffb, 0x51b816db)}, + {TOBN(0x6b33bfec, 0xaf06defd), TOBN(0xfe3c105f, 0x8592b70b), + TOBN(0xf937fda4, 0x61da6114), TOBN(0x3c13e651, 0x4c266ad7)}}, + {{TOBN(0xe363a829, 0x855938e8), TOBN(0x2eeb5d9e, 0x9de54b72), + TOBN(0xbeb93b0e, 0x20ccfab9), TOBN(0x3dffbb5f, 0x25e61a25)}, + {TOBN(0x7f655e43, 0x1acc093d), TOBN(0x0cb6cc3d, 0x3964ce61), + TOBN(0x6ab283a1, 0xe5e9b460), TOBN(0x55d787c5, 0xa1c7e72d)}}, + {{TOBN(0x4d2efd47, 0xdeadbf02), TOBN(0x11e80219, 0xac459068), + TOBN(0x810c7626, 0x71f311f0), TOBN(0xfa17ef8d, 0x4ab6ef53)}, + {TOBN(0xaf47fd25, 0x93e43bff), TOBN(0x5cb5ff3f, 0x0be40632), + TOBN(0x54687106, 0x8ee61da3), TOBN(0x7764196e, 0xb08afd0f)}}, + {{TOBN(0x831ab3ed, 0xf0290a8f), TOBN(0xcae81966, 0xcb47c387), + TOBN(0xaad7dece, 0x184efb4f), TOBN(0xdcfc53b3, 0x4749110e)}, + {TOBN(0x6698f23c, 0x4cb632f9), TOBN(0xc42a1ad6, 0xb91f8067), + TOBN(0xb116a81d, 0x6284180a), TOBN(0xebedf5f8, 0xe901326f)}}, + {{TOBN(0xf2274c9f, 0x97e3e044), TOBN(0x42018520, 0x11d09fc9), + TOBN(0x56a65f17, 0xd18e6e23), TOBN(0x2ea61e2a, 0x352b683c)}, + {TOBN(0x27d291bc, 0x575eaa94), TOBN(0x9e7bc721, 0xb8ff522d), + TOBN(0x5f7268bf, 0xa7f04d6f), TOBN(0x5868c73f, 0xaba41748)}}, + {{TOBN(0x9f85c2db, 0x7be0eead), TOBN(0x511e7842, 0xff719135), + TOBN(0x5a06b1e9, 0xc5ea90d7), TOBN(0x0c19e283, 0x26fab631)}, + {TOBN(0x8af8f0cf, 0xe9206c55), TOBN(0x89389cb4, 0x3553c06a), + TOBN(0x39dbed97, 0xf65f8004), TOBN(0x0621b037, 0xc508991d)}}, + {{TOBN(0x1c52e635, 0x96e78cc4), TOBN(0x5385c8b2, 0x0c06b4a8), + TOBN(0xd84ddfdb, 0xb0e87d03), TOBN(0xc49dfb66, 0x934bafad)}, + {TOBN(0x7071e170, 0x59f70772), TOBN(0x3a073a84, 0x3a1db56b), + TOBN(0x03494903, 0x3b8af190), TOBN(0x7d882de3, 0xd32920f0)}}, + {{TOBN(0x91633f0a, 0xb2cf8940), TOBN(0x72b0b178, 0x6f948f51), + TOBN(0x2d28dc30, 0x782653c8), TOBN(0x88829849, 0xdb903a05)}, + {TOBN(0xb8095d0c, 0x6a19d2bb), TOBN(0x4b9e7f0c, 0x86f782cb), + TOBN(0x7af73988, 0x2d907064), TOBN(0xd12be0fe, 0x8b32643c)}}, + {{TOBN(0x358ed23d, 0x0e165dc3), TOBN(0x3d47ce62, 0x4e2378ce), + TOBN(0x7e2bb0b9, 0xfeb8a087), TOBN(0x3246e8ae, 0xe29e10b9)}, + {TOBN(0x459f4ec7, 0x03ce2b4d), TOBN(0xe9b4ca1b, 0xbbc077cf), + TOBN(0x2613b4f2, 0x0e9940c1), TOBN(0xfc598bb9, 0x047d1eb1)}}, + {{TOBN(0x9744c62b, 0x45036099), TOBN(0xa9dee742, 0x167c65d8), + TOBN(0x0c511525, 0xdabe1943), TOBN(0xda110554, 0x93c6c624)}, + 
{TOBN(0xae00a52c, 0x651a3be2), TOBN(0xcda5111d, 0x884449a6), + TOBN(0x063c06f4, 0xff33bed1), TOBN(0x73baaf9a, 0x0d3d76b4)}}, + {{TOBN(0x52fb0c9d, 0x7fc63668), TOBN(0x6886c9dd, 0x0c039cde), + TOBN(0x602bd599, 0x55b22351), TOBN(0xb00cab02, 0x360c7c13)}, + {TOBN(0x8cb616bc, 0x81b69442), TOBN(0x41486700, 0xb55c3cee), + TOBN(0x71093281, 0xf49ba278), TOBN(0xad956d9c, 0x64a50710)}}, + {{TOBN(0x9561f28b, 0x638a7e81), TOBN(0x54155cdf, 0x5980ddc3), + TOBN(0xb2db4a96, 0xd26f247a), TOBN(0x9d774e4e, 0x4787d100)}, + {TOBN(0x1a9e6e2e, 0x078637d2), TOBN(0x1c363e2d, 0x5e0ae06a), + TOBN(0x7493483e, 0xe9cfa354), TOBN(0x76843cb3, 0x7f74b98d)}}, + {{TOBN(0xbaca6591, 0xd4b66947), TOBN(0xb452ce98, 0x04460a8c), + TOBN(0x6830d246, 0x43768f55), TOBN(0xf4197ed8, 0x7dff12df)}, + {TOBN(0x6521b472, 0x400dd0f7), TOBN(0x59f5ca8f, 0x4b1e7093), + TOBN(0x6feff11b, 0x080338ae), TOBN(0x0ada31f6, 0xa29ca3c6)}}, + {{TOBN(0x24794eb6, 0x94a2c215), TOBN(0xd83a43ab, 0x05a57ab4), + TOBN(0x264a543a, 0x2a6f89fe), TOBN(0x2c2a3868, 0xdd5ec7c2)}, + {TOBN(0xd3373940, 0x8439d9b2), TOBN(0x715ea672, 0x0acd1f11), + TOBN(0x42c1d235, 0xe7e6cc19), TOBN(0x81ce6e96, 0xb990585c)}}, + {{TOBN(0x04e5dfe0, 0xd809c7bd), TOBN(0xd7b2580c, 0x8f1050ab), + TOBN(0x6d91ad78, 0xd8a4176f), TOBN(0x0af556ee, 0x4e2e897c)}, + {TOBN(0x162a8b73, 0x921de0ac), TOBN(0x52ac9c22, 0x7ea78400), + TOBN(0xee2a4eea, 0xefce2174), TOBN(0xbe61844e, 0x6d637f79)}}, + {{TOBN(0x0491f1bc, 0x789a283b), TOBN(0x72d3ac3d, 0x880836f4), + TOBN(0xaa1c5ea3, 0x88e5402d), TOBN(0x1b192421, 0xd5cc473d)}, + {TOBN(0x5c0b9998, 0x9dc84cac), TOBN(0xb0a8482d, 0x9c6e75b8), + TOBN(0x639961d0, 0x3a191ce2), TOBN(0xda3bc865, 0x6d837930)}}, + {{TOBN(0xca990653, 0x056e6f8f), TOBN(0x84861c41, 0x64d133a7), + TOBN(0x8b403276, 0x746abe40), TOBN(0xb7b4d51a, 0xebf8e303)}, + {TOBN(0x05b43211, 0x220a255d), TOBN(0xc997152c, 0x02419e6e), + TOBN(0x76ff47b6, 0x630c2fea), TOBN(0x50518677, 0x281fdade)}}, + {{TOBN(0x3283b8ba, 0xcf902b0b), TOBN(0x8d4b4eb5, 0x37db303b), + TOBN(0xcc89f42d, 0x755011bc), TOBN(0xb43d74bb, 0xdd09d19b)}, + {TOBN(0x65746bc9, 0x8adba350), TOBN(0x364eaf8c, 0xb51c1927), + TOBN(0x13c76596, 0x10ad72ec), TOBN(0x30045121, 0xf8d40c20)}}, + {{TOBN(0x6d2d99b7, 0xea7b979b), TOBN(0xcd78cd74, 0xe6fb3bcd), + TOBN(0x11e45a9e, 0x86cffbfe), TOBN(0x78a61cf4, 0x637024f6)}, + {TOBN(0xd06bc872, 0x3d502295), TOBN(0xf1376854, 0x458cb288), + TOBN(0xb9db26a1, 0x342f8586), TOBN(0xf33effcf, 0x4beee09e)}}, + {{TOBN(0xd7e0c4cd, 0xb30cfb3a), TOBN(0x6d09b8c1, 0x6c9db4c8), + TOBN(0x40ba1a42, 0x07c8d9df), TOBN(0x6fd495f7, 0x1c52c66d)}, + {TOBN(0xfb0e169f, 0x275264da), TOBN(0x80c2b746, 0xe57d8362), + TOBN(0xedd987f7, 0x49ad7222), TOBN(0xfdc229af, 0x4398ec7b)}}}, + {{{TOBN(0xb0d1ed84, 0x52666a58), TOBN(0x4bcb6e00, 0xe6a9c3c2), + TOBN(0x3c57411c, 0x26906408), TOBN(0xcfc20755, 0x13556400)}, + {TOBN(0xa08b1c50, 0x5294dba3), TOBN(0xa30ba286, 0x8b7dd31e), + TOBN(0xd70ba90e, 0x991eca74), TOBN(0x094e142c, 0xe762c2b9)}}, + {{TOBN(0xb81d783e, 0x979f3925), TOBN(0x1efd130a, 0xaf4c89a7), + TOBN(0x525c2144, 0xfd1bf7fa), TOBN(0x4b296904, 0x1b265a9e)}, + {TOBN(0xed8e9634, 0xb9db65b6), TOBN(0x35c82e32, 0x03599d8a), + TOBN(0xdaa7a54f, 0x403563f3), TOBN(0x9df088ad, 0x022c38ab)}}, + {{TOBN(0xe5cfb066, 0xbb3fd30a), TOBN(0x429169da, 0xeff0354e), + TOBN(0x809cf852, 0x3524e36c), TOBN(0x136f4fb3, 0x0155be1d)}, + {TOBN(0x4826af01, 0x1fbba712), TOBN(0x6ef0f0b4, 0x506ba1a1), + TOBN(0xd9928b31, 0x77aea73e), TOBN(0xe2bf6af2, 0x5eaa244e)}}, + {{TOBN(0x8d084f12, 0x4237b64b), TOBN(0x688ebe99, 0xe3ecfd07), + TOBN(0x57b8a70c, 0xf6845dd8), TOBN(0x808fc59c, 
0x5da4a325)}, + {TOBN(0xa9032b2b, 0xa3585862), TOBN(0xb66825d5, 0xedf29386), + TOBN(0xb5a5a8db, 0x431ec29b), TOBN(0xbb143a98, 0x3a1e8dc8)}}, + {{TOBN(0x35ee94ce, 0x12ae381b), TOBN(0x3a7f176c, 0x86ccda90), + TOBN(0xc63a657e, 0x4606eaca), TOBN(0x9ae5a380, 0x43cd04df)}, + {TOBN(0x9bec8d15, 0xed251b46), TOBN(0x1f5d6d30, 0xcaca5e64), + TOBN(0x347b3b35, 0x9ff20f07), TOBN(0x4d65f034, 0xf7e4b286)}}, + {{TOBN(0x9e93ba24, 0xf111661e), TOBN(0xedced484, 0xb105eb04), + TOBN(0x96dc9ba1, 0xf424b578), TOBN(0xbf8f66b7, 0xe83e9069)}, + {TOBN(0x872d4df4, 0xd7ed8216), TOBN(0xbf07f377, 0x8e2cbecf), + TOBN(0x4281d899, 0x98e73754), TOBN(0xfec85fbb, 0x8aab8708)}}, + {{TOBN(0x9a3c0dee, 0xa5ba5b0b), TOBN(0xe6a116ce, 0x42d05299), + TOBN(0xae9775fe, 0xe9b02d42), TOBN(0x72b05200, 0xa1545cb6)}, + {TOBN(0xbc506f7d, 0x31a3b4ea), TOBN(0xe5893078, 0x8bbd9b32), + TOBN(0xc8bc5f37, 0xe4b12a97), TOBN(0x6b000c06, 0x4a73b671)}}, + {{TOBN(0x13b5bf22, 0x765fa7d0), TOBN(0x59805bf0, 0x1d6a5370), + TOBN(0x67a5e29d, 0x4280db98), TOBN(0x4f53916f, 0x776b1ce3)}, + {TOBN(0x714ff61f, 0x33ddf626), TOBN(0x4206238e, 0xa085d103), + TOBN(0x1c50d4b7, 0xe5809ee3), TOBN(0x999f450d, 0x85f8eb1d)}}, + {{TOBN(0x658a6051, 0xe4c79e9b), TOBN(0x1394cb73, 0xc66a9fea), + TOBN(0x27f31ed5, 0xc6be7b23), TOBN(0xf4c88f36, 0x5aa6f8fe)}, + {TOBN(0x0fb0721f, 0x4aaa499e), TOBN(0x68b3a7d5, 0xe3fb2a6b), + TOBN(0xa788097d, 0x3a92851d), TOBN(0x060e7f8a, 0xe96f4913)}}, + {{TOBN(0x82eebe73, 0x1a3a93bc), TOBN(0x42bbf465, 0xa21adc1a), + TOBN(0xc10b6fa4, 0xef030efd), TOBN(0x247aa4c7, 0x87b097bb)}, + {TOBN(0x8b8dc632, 0xf60c77da), TOBN(0x6ffbc26a, 0xc223523e), + TOBN(0xa4f6ff11, 0x344579cf), TOBN(0x5825653c, 0x980250f6)}}, + {{TOBN(0xb2dd097e, 0xbc1aa2b9), TOBN(0x07889393, 0x37a0333a), + TOBN(0x1cf55e71, 0x37a0db38), TOBN(0x2648487f, 0x792c1613)}, + {TOBN(0xdad01336, 0x3fcef261), TOBN(0x6239c81d, 0x0eabf129), + TOBN(0x8ee761de, 0x9d276be2), TOBN(0x406a7a34, 0x1eda6ad3)}}, + {{TOBN(0x4bf367ba, 0x4a493b31), TOBN(0x54f20a52, 0x9bf7f026), + TOBN(0xb696e062, 0x9795914b), TOBN(0xcddab96d, 0x8bf236ac)}, + {TOBN(0x4ff2c70a, 0xed25ea13), TOBN(0xfa1d09eb, 0x81cbbbe7), + TOBN(0x88fc8c87, 0x468544c5), TOBN(0x847a670d, 0x696b3317)}}, + {{TOBN(0xf133421e, 0x64bcb626), TOBN(0xaea638c8, 0x26dee0b5), + TOBN(0xd6e7680b, 0xb310346c), TOBN(0xe06f4097, 0xd5d4ced3)}, + {TOBN(0x09961452, 0x7512a30b), TOBN(0xf3d867fd, 0xe589a59a), + TOBN(0x2e73254f, 0x52d0c180), TOBN(0x9063d8a3, 0x333c74ac)}}, + {{TOBN(0xeda6c595, 0xd314e7bc), TOBN(0x2ee7464b, 0x467899ed), + TOBN(0x1cef423c, 0x0a1ed5d3), TOBN(0x217e76ea, 0x69cc7613)}, + {TOBN(0x27ccce1f, 0xe7cda917), TOBN(0x12d8016b, 0x8a893f16), + TOBN(0xbcd6de84, 0x9fc74f6b), TOBN(0xfa5817e2, 0xf3144e61)}}, + {{TOBN(0x1f354164, 0x0821ee4c), TOBN(0x1583eab4, 0x0bc61992), + TOBN(0x7490caf6, 0x1d72879f), TOBN(0x998ad9f3, 0xf76ae7b2)}, + {TOBN(0x1e181950, 0xa41157f7), TOBN(0xa9d7e1e6, 0xe8da3a7e), + TOBN(0x963784eb, 0x8426b95f), TOBN(0x0ee4ed6e, 0x542e2a10)}}, + {{TOBN(0xb79d4cc5, 0xac751e7b), TOBN(0x93f96472, 0xfd4211bd), + TOBN(0x8c72d3d2, 0xc8de4fc6), TOBN(0x7b69cbf5, 0xdf44f064)}, + {TOBN(0x3da90ca2, 0xf4bf94e1), TOBN(0x1a5325f8, 0xf12894e2), + TOBN(0x0a437f6c, 0x7917d60b), TOBN(0x9be70486, 0x96c9cb5d)}}, + {{TOBN(0xb4d880bf, 0xe1dc5c05), TOBN(0xd738adda, 0xeebeeb57), + TOBN(0x6f0119d3, 0xdf0fe6a3), TOBN(0x5c686e55, 0x66eaaf5a)}, + {TOBN(0x9cb10b50, 0xdfd0b7ec), TOBN(0xbdd0264b, 0x6a497c21), + TOBN(0xfc093514, 0x8c546c96), TOBN(0x58a947fa, 0x79dbf42a)}}, + {{TOBN(0xc0b48d4e, 0x49ccd6d7), TOBN(0xff8fb02c, 0x88bd5580), + TOBN(0xc75235e9, 0x07d473b2), 
TOBN(0x4fab1ac5, 0xa2188af3)}, + {TOBN(0x030fa3bc, 0x97576ec0), TOBN(0xe8c946e8, 0x0b7e7d2f), + TOBN(0x40a5c9cc, 0x70305600), TOBN(0x6d8260a9, 0xc8b013b4)}}, + {{TOBN(0x0368304f, 0x70bba85c), TOBN(0xad090da1, 0xa4a0d311), + TOBN(0x7170e870, 0x2415eec1), TOBN(0xbfba35fe, 0x8461ea47)}, + {TOBN(0x6279019a, 0xc1e91938), TOBN(0xa47638f3, 0x1afc415f), + TOBN(0x36c65cbb, 0xbcba0e0f), TOBN(0x02160efb, 0x034e2c48)}}, + {{TOBN(0xe6c51073, 0x615cd9e4), TOBN(0x498ec047, 0xf1243c06), + TOBN(0x3e5a8809, 0xb17b3d8c), TOBN(0x5cd99e61, 0x0cc565f1)}, + {TOBN(0x81e312df, 0x7851dafe), TOBN(0xf156f5ba, 0xa79061e2), + TOBN(0x80d62b71, 0x880c590e), TOBN(0xbec9746f, 0x0a39faa1)}}, + {{TOBN(0x1d98a9c1, 0xc8ed1f7a), TOBN(0x09e43bb5, 0xa81d5ff2), + TOBN(0xd5f00f68, 0x0da0794a), TOBN(0x412050d9, 0x661aa836)}, + {TOBN(0xa89f7c4e, 0x90747e40), TOBN(0x6dc05ebb, 0xb62a3686), + TOBN(0xdf4de847, 0x308e3353), TOBN(0x53868fbb, 0x9fb53bb9)}}, + {{TOBN(0x2b09d2c3, 0xcfdcf7dd), TOBN(0x41a9fce3, 0x723fcab4), + TOBN(0x73d905f7, 0x07f57ca3), TOBN(0x080f9fb1, 0xac8e1555)}, + {TOBN(0x7c088e84, 0x9ba7a531), TOBN(0x07d35586, 0xed9a147f), + TOBN(0x602846ab, 0xaf48c336), TOBN(0x7320fd32, 0x0ccf0e79)}}, + {{TOBN(0xaa780798, 0xb18bd1ff), TOBN(0x52c2e300, 0xafdd2905), + TOBN(0xf27ea3d6, 0x434267cd), TOBN(0x8b96d16d, 0x15605b5f)}, + {TOBN(0x7bb31049, 0x4b45706b), TOBN(0xe7f58b8e, 0x743d25f8), + TOBN(0xe9b5e45b, 0x87f30076), TOBN(0xd19448d6, 0x5d053d5a)}}, + {{TOBN(0x1ecc8cb9, 0xd3210a04), TOBN(0x6bc7d463, 0xdafb5269), + TOBN(0x3e59b10a, 0x67c3489f), TOBN(0x1769788c, 0x65641e1b)}, + {TOBN(0x8a53b82d, 0xbd6cb838), TOBN(0x7066d6e6, 0x236d5f22), + TOBN(0x03aa1c61, 0x6908536e), TOBN(0xc971da0d, 0x66ae9809)}}, + {{TOBN(0x01b3a86b, 0xc49a2fac), TOBN(0x3b8420c0, 0x3092e77a), + TOBN(0x02057300, 0x7d6fb556), TOBN(0x6941b2a1, 0xbff40a87)}, + {TOBN(0x140b6308, 0x0658ff2a), TOBN(0x87804363, 0x3424ab36), + TOBN(0x0253bd51, 0x5751e299), TOBN(0xc75bcd76, 0x449c3e3a)}}, + {{TOBN(0x92eb4090, 0x7f8f875d), TOBN(0x9c9d754e, 0x56c26bbf), + TOBN(0x158cea61, 0x8110bbe7), TOBN(0x62a6b802, 0x745f91ea)}, + {TOBN(0xa79c41aa, 0xc6e7394b), TOBN(0x445b6a83, 0xad57ef10), + TOBN(0x0c5277eb, 0x6ea6f40c), TOBN(0x319fe96b, 0x88633365)}}, + {{TOBN(0x0b0fc61f, 0x385f63cb), TOBN(0x41250c84, 0x22bdd127), + TOBN(0x67d153f1, 0x09e942c2), TOBN(0x60920d08, 0xc021ad5d)}, + {TOBN(0x229f5746, 0x724d81a5), TOBN(0xb7ffb892, 0x5bba3299), + TOBN(0x518c51a1, 0xde413032), TOBN(0x2a9bfe77, 0x3c2fd94c)}}, + {{TOBN(0xcbcde239, 0x3191f4fd), TOBN(0x43093e16, 0xd3d6ada1), + TOBN(0x184579f3, 0x58769606), TOBN(0x2c94a8b3, 0xd236625c)}, + {TOBN(0x6922b9c0, 0x5c437d8e), TOBN(0x3d4ae423, 0xd8d9f3c8), + TOBN(0xf72c31c1, 0x2e7090a2), TOBN(0x4ac3f5f3, 0xd76a55bd)}}, + {{TOBN(0x342508fc, 0x6b6af991), TOBN(0x0d527100, 0x1b5cebbd), + TOBN(0xb84740d0, 0xdd440dd7), TOBN(0x748ef841, 0x780162fd)}, + {TOBN(0xa8dbfe0e, 0xdfc6fafb), TOBN(0xeadfdf05, 0xf7300f27), + TOBN(0x7d06555f, 0xfeba4ec9), TOBN(0x12c56f83, 0x9e25fa97)}}, + {{TOBN(0x77f84203, 0xd39b8c34), TOBN(0xed8b1be6, 0x3125eddb), + TOBN(0x5bbf2441, 0xf6e39dc5), TOBN(0xb00f6ee6, 0x6a5d678a)}, + {TOBN(0xba456ecf, 0x57d0ea99), TOBN(0xdcae0f58, 0x17e06c43), + TOBN(0x01643de4, 0x0f5b4baa), TOBN(0x2c324341, 0xd161b9be)}}, + {{TOBN(0x80177f55, 0xe126d468), TOBN(0xed325f1f, 0x76748e09), + TOBN(0x6116004a, 0xcfa9bdc2), TOBN(0x2d8607e6, 0x3a9fb468)}, + {TOBN(0x0e573e27, 0x6009d660), TOBN(0x3a525d2e, 0x8d10c5a1), + TOBN(0xd26cb45c, 0x3b9009a0), TOBN(0xb6b0cdc0, 0xde9d7448)}}, + {{TOBN(0x949c9976, 0xe1337c26), TOBN(0x6faadebd, 0xd73d68e5), + TOBN(0x9e158614, 
0xf1b768d9), TOBN(0x22dfa557, 0x9cc4f069)}, + {TOBN(0xccd6da17, 0xbe93c6d6), TOBN(0x24866c61, 0xa504f5b9), + TOBN(0x2121353c, 0x8d694da1), TOBN(0x1c6ca580, 0x0140b8c6)}}, + {{TOBN(0xc245ad8c, 0xe964021e), TOBN(0xb83bffba, 0x032b82b3), + TOBN(0xfaa220c6, 0x47ef9898), TOBN(0x7e8d3ac6, 0x982c948a)}, + {TOBN(0x1faa2091, 0xbc2d124a), TOBN(0xbd54c3dd, 0x05b15ff4), + TOBN(0x386bf3ab, 0xc87c6fb7), TOBN(0xfb2b0563, 0xfdeb6f66)}}, + {{TOBN(0x4e77c557, 0x5b45afb4), TOBN(0xe9ded649, 0xefb8912d), + TOBN(0x7ec9bbf5, 0x42f6e557), TOBN(0x2570dfff, 0x62671f00)}, + {TOBN(0x2b3bfb78, 0x88e084bd), TOBN(0xa024b238, 0xf37fe5b4), + TOBN(0x44e7dc04, 0x95649aee), TOBN(0x498ca255, 0x5e7ec1d8)}}, + {{TOBN(0x3bc766ea, 0xaaa07e86), TOBN(0x0db6facb, 0xf3608586), + TOBN(0xbadd2549, 0xbdc259c8), TOBN(0x95af3c6e, 0x041c649f)}, + {TOBN(0xb36a928c, 0x02e30afb), TOBN(0x9b5356ad, 0x008a88b8), + TOBN(0x4b67a5f1, 0xcf1d9e9d), TOBN(0xc6542e47, 0xa5d8d8ce)}}, + {{TOBN(0x73061fe8, 0x7adfb6cc), TOBN(0xcc826fd3, 0x98678141), + TOBN(0x00e758b1, 0x3c80515a), TOBN(0x6afe3247, 0x41485083)}, + {TOBN(0x0fcb08b9, 0xb6ae8a75), TOBN(0xb8cf388d, 0x4acf51e1), + TOBN(0x344a5560, 0x6961b9d6), TOBN(0x1a6778b8, 0x6a97fd0c)}}, + {{TOBN(0xd840fdc1, 0xecc4c7e3), TOBN(0xde9fe47d, 0x16db68cc), + TOBN(0xe95f89de, 0xa3e216aa), TOBN(0x84f1a6a4, 0x9594a8be)}, + {TOBN(0x7ddc7d72, 0x5a7b162b), TOBN(0xc5cfda19, 0xadc817a3), + TOBN(0x80a5d350, 0x78b58d46), TOBN(0x93365b13, 0x82978f19)}}, + {{TOBN(0x2e44d225, 0x26a1fc90), TOBN(0x0d6d10d2, 0x4d70705d), + TOBN(0xd94b6b10, 0xd70c45f4), TOBN(0x0f201022, 0xb216c079)}, + {TOBN(0xcec966c5, 0x658fde41), TOBN(0xa8d2bc7d, 0x7e27601d), + TOBN(0xbfcce3e1, 0xff230be7), TOBN(0x3394ff6b, 0x0033ffb5)}}, + {{TOBN(0xd890c509, 0x8132c9af), TOBN(0xaac4b0eb, 0x361e7868), + TOBN(0x5194ded3, 0xe82d15aa), TOBN(0x4550bd2e, 0x23ae6b7d)}, + {TOBN(0x3fda318e, 0xea5399d4), TOBN(0xd989bffa, 0x91638b80), + TOBN(0x5ea124d0, 0xa14aa12d), TOBN(0x1fb1b899, 0x3667b944)}}, + {{TOBN(0x95ec7969, 0x44c44d6a), TOBN(0x91df144a, 0x57e86137), + TOBN(0x915fd620, 0x73adac44), TOBN(0x8f01732d, 0x59a83801)}, + {TOBN(0xec579d25, 0x3aa0a633), TOBN(0x06de5e7c, 0xc9d6d59c), + TOBN(0xc132f958, 0xb1ef8010), TOBN(0x29476f96, 0xe65c1a02)}}, + {{TOBN(0x336a77c0, 0xd34c3565), TOBN(0xef1105b2, 0x1b9f1e9e), + TOBN(0x63e6d08b, 0xf9e08002), TOBN(0x9aff2f21, 0xc613809e)}, + {TOBN(0xb5754f85, 0x3a80e75d), TOBN(0xde71853e, 0x6bbda681), + TOBN(0x86f041df, 0x8197fd7a), TOBN(0x8b332e08, 0x127817fa)}}, + {{TOBN(0x05d99be8, 0xb9c20cda), TOBN(0x89f7aad5, 0xd5cd0c98), + TOBN(0x7ef936fe, 0x5bb94183), TOBN(0x92ca0753, 0xb05cd7f2)}, + {TOBN(0x9d65db11, 0x74a1e035), TOBN(0x02628cc8, 0x13eaea92), + TOBN(0xf2d9e242, 0x49e4fbf2), TOBN(0x94fdfd9b, 0xe384f8b7)}}, + {{TOBN(0x65f56054, 0x63428c6b), TOBN(0x2f7205b2, 0x90b409a5), + TOBN(0xf778bb78, 0xff45ae11), TOBN(0xa13045be, 0xc5ee53b2)}, + {TOBN(0xe00a14ff, 0x03ef77fe), TOBN(0x689cd59f, 0xffef8bef), + TOBN(0x3578f0ed, 0x1e9ade22), TOBN(0xe99f3ec0, 0x6268b6a8)}}, + {{TOBN(0xa2057d91, 0xea1b3c3e), TOBN(0x2d1a7053, 0xb8823a4a), + TOBN(0xabbb336a, 0x2cca451e), TOBN(0xcd2466e3, 0x2218bb5d)}, + {TOBN(0x3ac1f42f, 0xc8cb762d), TOBN(0x7e312aae, 0x7690211f), + TOBN(0xebb9bd73, 0x45d07450), TOBN(0x207c4b82, 0x46c2213f)}}, + {{TOBN(0x99d425c1, 0x375913ec), TOBN(0x94e45e96, 0x67908220), + TOBN(0xc08f3087, 0xcd67dbf6), TOBN(0xa5670fbe, 0xc0887056)}, + {TOBN(0x6717b64a, 0x66f5b8fc), TOBN(0xd5a56aea, 0x786fec28), + TOBN(0xa8c3f55f, 0xc0ff4952), TOBN(0xa77fefae, 0x457ac49b)}}, + {{TOBN(0x29882d7c, 0x98379d44), TOBN(0xd000bdfb, 0x509edc8a), + 
TOBN(0xc6f95979, 0xe66fe464), TOBN(0x504a6115, 0xfa61bde0)}, + {TOBN(0x56b3b871, 0xeffea31a), TOBN(0x2d3de26d, 0xf0c21a54), + TOBN(0x21dbff31, 0x834753bf), TOBN(0xe67ecf49, 0x69269d86)}}, + {{TOBN(0x7a176952, 0x151fe690), TOBN(0x03515804, 0x7f2adb5f), + TOBN(0xee794b15, 0xd1b62a8d), TOBN(0xf004ceec, 0xaae454e6)}, + {TOBN(0x0897ea7c, 0xf0386fac), TOBN(0x3b62ff12, 0xd1fca751), + TOBN(0x154181df, 0x1b7a04ec), TOBN(0x2008e04a, 0xfb5847ec)}}, + {{TOBN(0xd147148e, 0x41dbd772), TOBN(0x2b419f73, 0x22942654), + TOBN(0x669f30d3, 0xe9c544f7), TOBN(0x52a2c223, 0xc8540149)}, + {TOBN(0x5da9ee14, 0x634dfb02), TOBN(0x5f074ff0, 0xf47869f3), + TOBN(0x74ee878d, 0xa3933acc), TOBN(0xe6510651, 0x4fe35ed1)}}, + {{TOBN(0xb3eb9482, 0xf1012e7a), TOBN(0x51013cc0, 0xa8a566ae), + TOBN(0xdd5e9243, 0x47c00d3b), TOBN(0x7fde089d, 0x946bb0e5)}, + {TOBN(0x030754fe, 0xc731b4b3), TOBN(0x12a136a4, 0x99fda062), + TOBN(0x7c1064b8, 0x5a1a35bc), TOBN(0xbf1f5763, 0x446c84ef)}}, + {{TOBN(0xed29a56d, 0xa16d4b34), TOBN(0x7fba9d09, 0xdca21c4f), + TOBN(0x66d7ac00, 0x6d8de486), TOBN(0x60061987, 0x73a2a5e1)}, + {TOBN(0x8b400f86, 0x9da28ff0), TOBN(0x3133f708, 0x43c4599c), + TOBN(0x9911c9b8, 0xee28cb0d), TOBN(0xcd7e2874, 0x8e0af61d)}}, + {{TOBN(0x5a85f0f2, 0x72ed91fc), TOBN(0x85214f31, 0x9cd4a373), + TOBN(0x881fe5be, 0x1925253c), TOBN(0xd8dc98e0, 0x91e8bc76)}, + {TOBN(0x7120affe, 0x585cc3a2), TOBN(0x724952ed, 0x735bf97a), + TOBN(0x5581e7dc, 0x3eb34581), TOBN(0x5cbff4f2, 0xe52ee57d)}}, + {{TOBN(0x8d320a0e, 0x87d8cc7b), TOBN(0x9beaa7f3, 0xf1d280d0), + TOBN(0x7a0b9571, 0x9beec704), TOBN(0x9126332e, 0x5b7f0057)}, + {TOBN(0x01fbc1b4, 0x8ed3bd6d), TOBN(0x35bb2c12, 0xd945eb24), + TOBN(0x6404694e, 0x9a8ae255), TOBN(0xb6092eec, 0x8d6abfb3)}}, + {{TOBN(0x4d76143f, 0xcc058865), TOBN(0x7b0a5af2, 0x6e249922), + TOBN(0x8aef9440, 0x6a50d353), TOBN(0xe11e4bcc, 0x64f0e07a)}, + {TOBN(0x4472993a, 0xa14a90fa), TOBN(0x7706e20c, 0xba0c51d4), + TOBN(0xf403292f, 0x1532672d), TOBN(0x52573bfa, 0x21829382)}}, + {{TOBN(0x6a7bb6a9, 0x3b5bdb83), TOBN(0x08da65c0, 0xa4a72318), + TOBN(0xc58d22aa, 0x63eb065f), TOBN(0x1717596c, 0x1b15d685)}, + {TOBN(0x112df0d0, 0xb266d88b), TOBN(0xf688ae97, 0x5941945a), + TOBN(0x487386e3, 0x7c292cac), TOBN(0x42f3b50d, 0x57d6985c)}}, + {{TOBN(0x6da4f998, 0x6a90fc34), TOBN(0xc8f257d3, 0x65ca8a8d), + TOBN(0xc2feabca, 0x6951f762), TOBN(0xe1bc81d0, 0x74c323ac)}, + {TOBN(0x1bc68f67, 0x251a2a12), TOBN(0x10d86587, 0xbe8a70dc), + TOBN(0xd648af7f, 0xf0f84d2e), TOBN(0xf0aa9ebc, 0x6a43ac92)}}, + {{TOBN(0x69e3be04, 0x27596893), TOBN(0xb6bb02a6, 0x45bf452b), + TOBN(0x0875c11a, 0xf4c698c8), TOBN(0x6652b5c7, 0xbece3794)}, + {TOBN(0x7b3755fd, 0x4f5c0499), TOBN(0x6ea16558, 0xb5532b38), + TOBN(0xd1c69889, 0xa2e96ef7), TOBN(0x9c773c3a, 0x61ed8f48)}}, + {{TOBN(0x2b653a40, 0x9b323abc), TOBN(0xe26605e1, 0xf0e1d791), + TOBN(0x45d41064, 0x4a87157a), TOBN(0x8f9a78b7, 0xcbbce616)}, + {TOBN(0xcf1e44aa, 0xc407eddd), TOBN(0x81ddd1d8, 0xa35b964f), + TOBN(0x473e339e, 0xfd083999), TOBN(0x6c94bdde, 0x8e796802)}}, + {{TOBN(0x5a304ada, 0x8545d185), TOBN(0x82ae44ea, 0x738bb8cb), + TOBN(0x628a35e3, 0xdf87e10e), TOBN(0xd3624f3d, 0xa15b9fe3)}, + {TOBN(0xcc44209b, 0x14be4254), TOBN(0x7d0efcbc, 0xbdbc2ea5), + TOBN(0x1f603362, 0x04c37bbe), TOBN(0x21f363f5, 0x56a5852c)}}, + {{TOBN(0xa1503d1c, 0xa8501550), TOBN(0x2251e0e1, 0xd8ab10bb), + TOBN(0xde129c96, 0x6961c51c), TOBN(0x1f7246a4, 0x81910f68)}, + {TOBN(0x2eb744ee, 0x5f2591f2), TOBN(0x3c47d33f, 0x5e627157), + TOBN(0x4d6d62c9, 0x22f3bd68), TOBN(0x6120a64b, 0xcb8df856)}}, + {{TOBN(0x3a9ac6c0, 0x7b5d07df), TOBN(0xa92b9558, 0x7ef39783), 
+ TOBN(0xe128a134, 0xab3a9b4f), TOBN(0x41c18807, 0xb1252f05)}, + {TOBN(0xfc7ed089, 0x80ba9b1c), TOBN(0xac8dc6de, 0xc532a9dd), + TOBN(0xbf829cef, 0x55246809), TOBN(0x101b784f, 0x5b4ee80f)}}, + {{TOBN(0xc09945bb, 0xb6f11603), TOBN(0x57b09dbe, 0x41d2801e), + TOBN(0xfba5202f, 0xa97534a8), TOBN(0x7fd8ae5f, 0xc17b9614)}, + {TOBN(0xa50ba666, 0x78308435), TOBN(0x9572f77c, 0xd3868c4d), + TOBN(0x0cef7bfd, 0x2dd7aab0), TOBN(0xe7958e08, 0x2c7c79ff)}}, + {{TOBN(0x81262e42, 0x25346689), TOBN(0x716da290, 0xb07c7004), + TOBN(0x35f911ea, 0xb7950ee3), TOBN(0x6fd72969, 0x261d21b5)}, + {TOBN(0x52389803, 0x08b640d3), TOBN(0x5b0026ee, 0x887f12a1), + TOBN(0x20e21660, 0x742e9311), TOBN(0x0ef6d541, 0x5ff77ff7)}}, + {{TOBN(0x969127f0, 0xf9c41135), TOBN(0xf21d60c9, 0x68a64993), + TOBN(0x656e5d0c, 0xe541875c), TOBN(0xf1e0f84e, 0xa1d3c233)}, + {TOBN(0x9bcca359, 0x06002d60), TOBN(0xbe2da60c, 0x06191552), + TOBN(0x5da8bbae, 0x61181ec3), TOBN(0x9f04b823, 0x65806f19)}}, + {{TOBN(0xf1604a7d, 0xd4b79bb8), TOBN(0xaee806fb, 0x52c878c8), + TOBN(0x34144f11, 0x8d47b8e8), TOBN(0x72edf52b, 0x949f9054)}, + {TOBN(0xebfca84e, 0x2127015a), TOBN(0x9051d0c0, 0x9cb7cef3), + TOBN(0x86e8fe58, 0x296deec8), TOBN(0x33b28188, 0x41010d74)}}}, + {{{TOBN(0x01079383, 0x171b445f), TOBN(0x9bcf21e3, 0x8131ad4c), + TOBN(0x8cdfe205, 0xc93987e8), TOBN(0xe63f4152, 0xc92e8c8f)}, + {TOBN(0x729462a9, 0x30add43d), TOBN(0x62ebb143, 0xc980f05a), + TOBN(0x4f3954e5, 0x3b06e968), TOBN(0xfe1d75ad, 0x242cf6b1)}}, + {{TOBN(0x5f95c6c7, 0xaf8685c8), TOBN(0xd4c1c8ce, 0x2f8f01aa), + TOBN(0xc44bbe32, 0x2574692a), TOBN(0xb8003478, 0xd4a4a068)}, + {TOBN(0x7c8fc6e5, 0x2eca3cdb), TOBN(0xea1db16b, 0xec04d399), + TOBN(0xb05bc82e, 0x8f2bc5cf), TOBN(0x763d517f, 0xf44793d2)}}, + {{TOBN(0x4451c1b8, 0x08bd98d0), TOBN(0x644b1cd4, 0x6575f240), + TOBN(0x6907eb33, 0x7375d270), TOBN(0x56c8bebd, 0xfa2286bd)}, + {TOBN(0xc713d2ac, 0xc4632b46), TOBN(0x17da427a, 0xafd60242), + TOBN(0x313065b7, 0xc95c7546), TOBN(0xf8239898, 0xbf17a3de)}}, + {{TOBN(0xf3b7963f, 0x4c830320), TOBN(0x842c7aa0, 0x903203e3), + TOBN(0xaf22ca0a, 0xe7327afb), TOBN(0x38e13092, 0x967609b6)}, + {TOBN(0x73b8fb62, 0x757558f1), TOBN(0x3cc3e831, 0xf7eca8c1), + TOBN(0xe4174474, 0xf6331627), TOBN(0xa77989ca, 0xc3c40234)}}, + {{TOBN(0xe5fd17a1, 0x44a081e0), TOBN(0xd797fb7d, 0xb70e296a), + TOBN(0x2b472b30, 0x481f719c), TOBN(0x0e632a98, 0xfe6f8c52)}, + {TOBN(0x89ccd116, 0xc5f0c284), TOBN(0xf51088af, 0x2d987c62), + TOBN(0x2a2bccda, 0x4c2de6cf), TOBN(0x810f9efe, 0xf679f0f9)}}, + {{TOBN(0xb0f394b9, 0x7ffe4b3e), TOBN(0x0b691d21, 0xe5fa5d21), + TOBN(0xb0bd7747, 0x9dfbbc75), TOBN(0xd2830fda, 0xfaf78b00)}, + {TOBN(0xf78c249c, 0x52434f57), TOBN(0x4b1f7545, 0x98096dab), + TOBN(0x73bf6f94, 0x8ff8c0b3), TOBN(0x34aef03d, 0x454e134c)}}, + {{TOBN(0xf8d151f4, 0xb7ac7ec5), TOBN(0xd6ceb95a, 0xe50da7d5), + TOBN(0xa1b492b0, 0xdc3a0eb8), TOBN(0x75157b69, 0xb3dd2863)}, + {TOBN(0xe2c4c74e, 0xc5413d62), TOBN(0xbe329ff7, 0xbc5fc4c7), + TOBN(0x835a2aea, 0x60fa9dda), TOBN(0xf117f5ad, 0x7445cb87)}}, + {{TOBN(0xae8317f4, 0xb0166f7a), TOBN(0xfbd3e3f7, 0xceec74e6), + TOBN(0xfdb516ac, 0xe0874bfd), TOBN(0x3d846019, 0xc681f3a3)}, + {TOBN(0x0b12ee5c, 0x7c1620b0), TOBN(0xba68b4dd, 0x2b63c501), + TOBN(0xac03cd32, 0x6668c51e), TOBN(0x2a6279f7, 0x4e0bcb5b)}}, + {{TOBN(0x17bd69b0, 0x6ae85c10), TOBN(0x72946979, 0x1dfdd3a6), + TOBN(0xd9a03268, 0x2c078bec), TOBN(0x41c6a658, 0xbfd68a52)}, + {TOBN(0xcdea1024, 0x0e023900), TOBN(0xbaeec121, 0xb10d144d), + TOBN(0x5a600e74, 0x058ab8dc), TOBN(0x1333af21, 0xbb89ccdd)}}, + {{TOBN(0xdf25eae0, 0x3aaba1f1), TOBN(0x2cada16e, 
0x3b7144cf), + TOBN(0x657ee27d, 0x71ab98bc), TOBN(0x99088b4c, 0x7a6fc96e)}, + {TOBN(0x05d5c0a0, 0x3549dbd4), TOBN(0x42cbdf8f, 0xf158c3ac), + TOBN(0x3fb6b3b0, 0x87edd685), TOBN(0x22071cf6, 0x86f064d0)}}, + {{TOBN(0xd2d6721f, 0xff2811e5), TOBN(0xdb81b703, 0xfe7fae8c), + TOBN(0x3cfb74ef, 0xd3f1f7bb), TOBN(0x0cdbcd76, 0x16cdeb5d)}, + {TOBN(0x4f39642a, 0x566a808c), TOBN(0x02b74454, 0x340064d6), + TOBN(0xfabbadca, 0x0528fa6f), TOBN(0xe4c3074c, 0xd3fc0bb6)}}, + {{TOBN(0xb32cb8b0, 0xb796d219), TOBN(0xc3e95f4f, 0x34741dd9), + TOBN(0x87212125, 0x68edf6f5), TOBN(0x7a03aee4, 0xa2b9cb8e)}, + {TOBN(0x0cd3c376, 0xf53a89aa), TOBN(0x0d8af9b1, 0x948a28dc), + TOBN(0xcf86a3f4, 0x902ab04f), TOBN(0x8aacb62a, 0x7f42002d)}}, + {{TOBN(0x106985eb, 0xf62ffd52), TOBN(0xe670b54e, 0x5797bf10), + TOBN(0x4b405209, 0xc5e30aef), TOBN(0x12c97a20, 0x4365b5e9)}, + {TOBN(0x104646ce, 0x1fe32093), TOBN(0x13cb4ff6, 0x3907a8c9), + TOBN(0x8b9f30d1, 0xd46e726b), TOBN(0xe1985e21, 0xaba0f499)}}, + {{TOBN(0xc573dea9, 0x10a230cd), TOBN(0x24f46a93, 0xcd30f947), + TOBN(0xf2623fcf, 0xabe2010a), TOBN(0x3f278cb2, 0x73f00e4f)}, + {TOBN(0xed55c67d, 0x50b920eb), TOBN(0xf1cb9a2d, 0x8e760571), + TOBN(0x7c50d109, 0x0895b709), TOBN(0x4207cf07, 0x190d4369)}}, + {{TOBN(0x3b027e81, 0xc4127fe1), TOBN(0xa9f8b9ad, 0x3ae9c566), + TOBN(0x5ab10851, 0xacbfbba5), TOBN(0xa747d648, 0x569556f5)}, + {TOBN(0xcc172b5c, 0x2ba97bf7), TOBN(0x15e0f77d, 0xbcfa3324), + TOBN(0xa345b797, 0x7686279d), TOBN(0x5a723480, 0xe38003d3)}}, + {{TOBN(0xfd8e139f, 0x8f5fcda8), TOBN(0xf3e558c4, 0xbdee5bfd), + TOBN(0xd76cbaf4, 0xe33f9f77), TOBN(0x3a4c97a4, 0x71771969)}, + {TOBN(0xda27e84b, 0xf6dce6a7), TOBN(0xff373d96, 0x13e6c2d1), + TOBN(0xf115193c, 0xd759a6e9), TOBN(0x3f9b7025, 0x63d2262c)}}, + {{TOBN(0xd9764a31, 0x317cd062), TOBN(0x30779d8e, 0x199f8332), + TOBN(0xd8074106, 0x16b11b0b), TOBN(0x7917ab9f, 0x78aeaed8)}, + {TOBN(0xb67a9cbe, 0x28fb1d8e), TOBN(0x2e313563, 0x136eda33), + TOBN(0x010b7069, 0xa371a86c), TOBN(0x44d90fa2, 0x6744e6b7)}}, + {{TOBN(0x68190867, 0xd6b3e243), TOBN(0x9fe6cd9d, 0x59048c48), + TOBN(0xb900b028, 0x95731538), TOBN(0xa012062f, 0x32cae04f)}, + {TOBN(0x8107c8bc, 0x9399d082), TOBN(0x47e8c54a, 0x41df12e2), + TOBN(0x14ba5117, 0xb6ef3f73), TOBN(0x22260bea, 0x81362f0b)}}, + {{TOBN(0x90ea261e, 0x1a18cc20), TOBN(0x2192999f, 0x2321d636), + TOBN(0xef64d314, 0xe311b6a0), TOBN(0xd7401e4c, 0x3b54a1f5)}, + {TOBN(0x19019983, 0x6fbca2ba), TOBN(0x46ad3293, 0x8fbffc4b), + TOBN(0xa142d3f6, 0x3786bf40), TOBN(0xeb5cbc26, 0xb67039fc)}}, + {{TOBN(0x9cb0ae6c, 0x252bd479), TOBN(0x05e0f88a, 0x12b5848f), + TOBN(0x78f6d2b2, 0xa5c97663), TOBN(0x6f6e149b, 0xc162225c)}, + {TOBN(0xe602235c, 0xde601a89), TOBN(0xd17bbe98, 0xf373be1f), + TOBN(0xcaf49a5b, 0xa8471827), TOBN(0x7e1a0a85, 0x18aaa116)}}, + {{TOBN(0x6c833196, 0x270580c3), TOBN(0x1e233839, 0xf1c98a14), + TOBN(0x67b2f7b4, 0xae34e0a5), TOBN(0x47ac8745, 0xd8ce7289)}, + {TOBN(0x2b74779a, 0x100dd467), TOBN(0x274a4337, 0x4ee50d09), + TOBN(0x603dcf13, 0x83608bc9), TOBN(0xcd9da6c3, 0xc89e8388)}}, + {{TOBN(0x2660199f, 0x355116ac), TOBN(0xcc38bb59, 0xb6d18eed), + TOBN(0x3075f31f, 0x2f4bc071), TOBN(0x9774457f, 0x265dc57e)}, + {TOBN(0x06a6a9c8, 0xc6db88bb), TOBN(0x6429d07f, 0x4ec98e04), + TOBN(0x8d05e57b, 0x05ecaa8b), TOBN(0x20f140b1, 0x7872ea7b)}}, + {{TOBN(0xdf8c0f09, 0xca494693), TOBN(0x48d3a020, 0xf252e909), + TOBN(0x4c5c29af, 0x57b14b12), TOBN(0x7e6fa37d, 0xbf47ad1c)}, + {TOBN(0x66e7b506, 0x49a0c938), TOBN(0xb72c0d48, 0x6be5f41f), + TOBN(0x6a6242b8, 0xb2359412), TOBN(0xcd35c774, 0x8e859480)}}, + {{TOBN(0x12536fea, 0x87baa627), 
TOBN(0x58c1fec1, 0xf72aa680), + TOBN(0x6c29b637, 0x601e5dc9), TOBN(0x9e3c3c1c, 0xde9e01b9)}, + {TOBN(0xefc8127b, 0x2bcfe0b0), TOBN(0x35107102, 0x2a12f50d), + TOBN(0x6ccd6cb1, 0x4879b397), TOBN(0xf792f804, 0xf8a82f21)}}, + {{TOBN(0x509d4804, 0xa9b46402), TOBN(0xedddf85d, 0xc10f0850), + TOBN(0x928410dc, 0x4b6208aa), TOBN(0xf6229c46, 0x391012dc)}, + {TOBN(0xc5a7c41e, 0x7727b9b6), TOBN(0x289e4e4b, 0xaa444842), + TOBN(0x049ba1d9, 0xe9a947ea), TOBN(0x44f9e47f, 0x83c8debc)}}, + {{TOBN(0xfa77a1fe, 0x611f8b8e), TOBN(0xfd2e416a, 0xf518f427), + TOBN(0xc5fffa70, 0x114ebac3), TOBN(0xfe57c4e9, 0x5d89697b)}, + {TOBN(0xfdd053ac, 0xb1aaf613), TOBN(0x31df210f, 0xea585a45), + TOBN(0x318cc10e, 0x24985034), TOBN(0x1a38efd1, 0x5f1d6130)}}, + {{TOBN(0xbf86f237, 0x0b1e9e21), TOBN(0xb258514d, 0x1dbe88aa), + TOBN(0x1e38a588, 0x90c1baf9), TOBN(0x2936a01e, 0xbdb9b692)}, + {TOBN(0xd576de98, 0x6dd5b20c), TOBN(0xb586bf71, 0x70f98ecf), + TOBN(0xcccf0f12, 0xc42d2fd7), TOBN(0x8717e61c, 0xfb35bd7b)}}, + {{TOBN(0x8b1e5722, 0x35e6fc06), TOBN(0x3477728f, 0x0b3e13d5), + TOBN(0x150c294d, 0xaa8a7372), TOBN(0xc0291d43, 0x3bfa528a)}, + {TOBN(0xc6c8bc67, 0xcec5a196), TOBN(0xdeeb31e4, 0x5c2e8a7c), + TOBN(0xba93e244, 0xfb6e1c51), TOBN(0xb9f8b71b, 0x2e28e156)}}, + {{TOBN(0xce65a287, 0x968a2ab9), TOBN(0xe3c5ce69, 0x46bbcb1f), + TOBN(0xf8c835b9, 0xe7ae3f30), TOBN(0x16bbee26, 0xff72b82b)}, + {TOBN(0x665e2017, 0xfd42cd22), TOBN(0x1e139970, 0xf8b1d2a0), + TOBN(0x125cda29, 0x79204932), TOBN(0x7aee94a5, 0x49c3bee5)}}, + {{TOBN(0x68c70160, 0x89821a66), TOBN(0xf7c37678, 0x8f981669), + TOBN(0xd90829fc, 0x48cc3645), TOBN(0x346af049, 0xd70addfc)}, + {TOBN(0x2057b232, 0x370bf29c), TOBN(0xf90c73ce, 0x42e650ee), + TOBN(0xe03386ea, 0xa126ab90), TOBN(0x0e266e7e, 0x975a087b)}}, + {{TOBN(0x80578eb9, 0x0fca65d9), TOBN(0x7e2989ea, 0x16af45b8), + TOBN(0x7438212d, 0xcac75a4e), TOBN(0x38c7ca39, 0x4fef36b8)}, + {TOBN(0x8650c494, 0xd402676a), TOBN(0x26ab5a66, 0xf72c7c48), + TOBN(0x4e6cb426, 0xce3a464e), TOBN(0xf8f99896, 0x2b72f841)}}, + {{TOBN(0x8c318491, 0x1a335cc8), TOBN(0x563459ba, 0x6a5913e4), + TOBN(0x1b920d61, 0xc7b32919), TOBN(0x805ab8b6, 0xa02425ad)}, + {TOBN(0x2ac512da, 0x8d006086), TOBN(0x6ca4846a, 0xbcf5c0fd), + TOBN(0xafea51d8, 0xac2138d7), TOBN(0xcb647545, 0x344cd443)}}, + {{TOBN(0x0429ee8f, 0xbd7d9040), TOBN(0xee66a2de, 0x819b9c96), + TOBN(0x54f9ec25, 0xdea7d744), TOBN(0x2ffea642, 0x671721bb)}, + {TOBN(0x4f19dbd1, 0x114344ea), TOBN(0x04304536, 0xfd0dbc8b), + TOBN(0x014b50aa, 0x29ec7f91), TOBN(0xb5fc22fe, 0xbb06014d)}}, + {{TOBN(0x60d963a9, 0x1ee682e0), TOBN(0xdf48abc0, 0xfe85c727), + TOBN(0x0cadba13, 0x2e707c2d), TOBN(0xde608d3a, 0xa645aeff)}, + {TOBN(0x05f1c28b, 0xedafd883), TOBN(0x3c362ede, 0xbd94de1f), + TOBN(0x8dd0629d, 0x13593e41), TOBN(0x0a5e736f, 0x766d6eaf)}}, + {{TOBN(0xbfa92311, 0xf68cf9d1), TOBN(0xa4f9ef87, 0xc1797556), + TOBN(0x10d75a1f, 0x5601c209), TOBN(0x651c374c, 0x09b07361)}, + {TOBN(0x49950b58, 0x88b5cead), TOBN(0x0ef00058, 0x6fa9dbaa), + TOBN(0xf51ddc26, 0x4e15f33a), TOBN(0x1f8b5ca6, 0x2ef46140)}}, + {{TOBN(0x343ac0a3, 0xee9523f0), TOBN(0xbb75eab2, 0x975ea978), + TOBN(0x1bccf332, 0x107387f4), TOBN(0x790f9259, 0x9ab0062e)}, + {TOBN(0xf1a363ad, 0x1e4f6a5f), TOBN(0x06e08b84, 0x62519a50), + TOBN(0x60915187, 0x7265f1ee), TOBN(0x6a80ca34, 0x93ae985e)}}, + {{TOBN(0x81b29768, 0xaaba4864), TOBN(0xb13cabf2, 0x8d52a7d6), + TOBN(0xb5c36348, 0x8ead03f1), TOBN(0xc932ad95, 0x81c7c1c0)}, + {TOBN(0x5452708e, 0xcae1e27b), TOBN(0x9dac4269, 0x1b0df648), + TOBN(0x233e3f0c, 0xdfcdb8bc), TOBN(0xe6ceccdf, 0xec540174)}}, + {{TOBN(0xbd0d845e, 
0x95081181), TOBN(0xcc8a7920, 0x699355d5), + TOBN(0x111c0f6d, 0xc3b375a8), TOBN(0xfd95bc6b, 0xfd51e0dc)}, + {TOBN(0x4a106a26, 0x6888523a), TOBN(0x4d142bd6, 0xcb01a06d), + TOBN(0x79bfd289, 0xadb9b397), TOBN(0x0bdbfb94, 0xe9863914)}}, + {{TOBN(0x29d8a229, 0x1660f6a6), TOBN(0x7f6abcd6, 0x551c042d), + TOBN(0x13039deb, 0x0ac3ffe8), TOBN(0xa01be628, 0xec8523fb)}, + {TOBN(0x6ea34103, 0x0ca1c328), TOBN(0xc74114bd, 0xb903928e), + TOBN(0x8aa4ff4e, 0x9e9144b0), TOBN(0x7064091f, 0x7f9a4b17)}}, + {{TOBN(0xa3f4f521, 0xe447f2c4), TOBN(0x81b8da7a, 0x604291f0), + TOBN(0xd680bc46, 0x7d5926de), TOBN(0x84f21fd5, 0x34a1202f)}, + {TOBN(0x1d1e3181, 0x4e9df3d8), TOBN(0x1ca4861a, 0x39ab8d34), + TOBN(0x809ddeec, 0x5b19aa4a), TOBN(0x59f72f7e, 0x4d329366)}}, + {{TOBN(0xa2f93f41, 0x386d5087), TOBN(0x40bf739c, 0xdd67d64f), + TOBN(0xb4494205, 0x66702158), TOBN(0xc33c65be, 0x73b1e178)}, + {TOBN(0xcdcd657c, 0x38ca6153), TOBN(0x97f4519a, 0xdc791976), + TOBN(0xcc7c7f29, 0xcd6e1f39), TOBN(0x38de9cfb, 0x7e3c3932)}}, + {{TOBN(0xe448eba3, 0x7b793f85), TOBN(0xe9f8dbf9, 0xf067e914), + TOBN(0xc0390266, 0xf114ae87), TOBN(0x39ed75a7, 0xcd6a8e2a)}, + {TOBN(0xadb14848, 0x7ffba390), TOBN(0x67f8cb8b, 0x6af9bc09), + TOBN(0x322c3848, 0x9c7476db), TOBN(0xa320fecf, 0x52a538d6)}}, + {{TOBN(0xe0493002, 0xb2aced2b), TOBN(0xdfba1809, 0x616bd430), + TOBN(0x531c4644, 0xc331be70), TOBN(0xbc04d32e, 0x90d2e450)}, + {TOBN(0x1805a0d1, 0x0f9f142d), TOBN(0x2c44a0c5, 0x47ee5a23), + TOBN(0x31875a43, 0x3989b4e3), TOBN(0x6b1949fd, 0x0c063481)}}, + {{TOBN(0x2dfb9e08, 0xbe0f4492), TOBN(0x3ff0da03, 0xe9d5e517), + TOBN(0x03dbe9a1, 0xf79466a8), TOBN(0x0b87bcd0, 0x15ea9932)}, + {TOBN(0xeb64fc83, 0xab1f58ab), TOBN(0x6d9598da, 0x817edc8a), + TOBN(0x699cff66, 0x1d3b67e5), TOBN(0x645c0f29, 0x92635853)}}, + {{TOBN(0x253cdd82, 0xeabaf21c), TOBN(0x82b9602a, 0x2241659e), + TOBN(0x2cae07ec, 0x2d9f7091), TOBN(0xbe4c720c, 0x8b48cd9b)}, + {TOBN(0x6ce5bc03, 0x6f08d6c9), TOBN(0x36e8a997, 0xaf10bf40), + TOBN(0x83422d21, 0x3e10ff12), TOBN(0x7b26d3eb, 0xbcc12494)}}, + {{TOBN(0xb240d2d0, 0xc9469ad6), TOBN(0xc4a11b4d, 0x30afa05b), + TOBN(0x4b604ace, 0xdd6ba286), TOBN(0x18486600, 0x3ee2864c)}, + {TOBN(0x5869d6ba, 0x8d9ce5be), TOBN(0x0d8f68c5, 0xff4bfb0d), + TOBN(0xb69f210b, 0x5700cf73), TOBN(0x61f6653a, 0x6d37c135)}}, + {{TOBN(0xff3d432b, 0x5aff5a48), TOBN(0x0d81c4b9, 0x72ba3a69), + TOBN(0xee879ae9, 0xfa1899ef), TOBN(0xbac7e2a0, 0x2d6acafd)}, + {TOBN(0xd6d93f6c, 0x1c664399), TOBN(0x4c288de1, 0x5bcb135d), + TOBN(0x83031dab, 0x9dab7cbf), TOBN(0xfe23feb0, 0x3abbf5f0)}}, + {{TOBN(0x9f1b2466, 0xcdedca85), TOBN(0x140bb710, 0x1a09538c), + TOBN(0xac8ae851, 0x5e11115d), TOBN(0x0d63ff67, 0x6f03f59e)}, + {TOBN(0x755e5551, 0x7d234afb), TOBN(0x61c2db4e, 0x7e208fc1), + TOBN(0xaa9859ce, 0xf28a4b5d), TOBN(0xbdd6d4fc, 0x34af030f)}}, + {{TOBN(0xd1c4a26d, 0x3be01cb1), TOBN(0x9ba14ffc, 0x243aa07c), + TOBN(0xf95cd3a9, 0xb2503502), TOBN(0xe379bc06, 0x7d2a93ab)}, + {TOBN(0x3efc18e9, 0xd4ca8d68), TOBN(0x083558ec, 0x80bb412a), + TOBN(0xd903b940, 0x9645a968), TOBN(0xa499f0b6, 0x9ba6054f)}}, + {{TOBN(0x208b573c, 0xb8349abe), TOBN(0x3baab3e5, 0x30b4fc1c), + TOBN(0x87e978ba, 0xcb524990), TOBN(0x3524194e, 0xccdf0e80)}, + {TOBN(0x62711725, 0x7d4bcc42), TOBN(0xe90a3d9b, 0xb90109ba), + TOBN(0x3b1bdd57, 0x1323e1e0), TOBN(0xb78e9bd5, 0x5eae1599)}}, + {{TOBN(0x0794b746, 0x9e03d278), TOBN(0x80178605, 0xd70e6297), + TOBN(0x171792f8, 0x99c97855), TOBN(0x11b393ee, 0xf5a86b5c)}, + {TOBN(0x48ef6582, 0xd8884f27), TOBN(0xbd44737a, 0xbf19ba5f), + TOBN(0x8698de4c, 0xa42062c6), TOBN(0x8975eb80, 0x61ce9c54)}}, + 
{{TOBN(0xd50e57c7, 0xd7fe71f3), TOBN(0x15342190, 0xbc97ce38), + TOBN(0x51bda2de, 0x4df07b63), TOBN(0xba12aeae, 0x200eb87d)}, + {TOBN(0xabe135d2, 0xa9b4f8f6), TOBN(0x04619d65, 0xfad6d99c), + TOBN(0x4a6683a7, 0x7994937c), TOBN(0x7a778c8b, 0x6f94f09a)}}, + {{TOBN(0x8c508623, 0x20a71b89), TOBN(0x241a2aed, 0x1c229165), + TOBN(0x352be595, 0xaaf83a99), TOBN(0x9fbfee7f, 0x1562bac8)}, + {TOBN(0xeaf658b9, 0x5c4017e3), TOBN(0x1dc7f9e0, 0x15120b86), + TOBN(0xd84f13dd, 0x4c034d6f), TOBN(0x283dd737, 0xeaea3038)}}, + {{TOBN(0x197f2609, 0xcd85d6a2), TOBN(0x6ebbc345, 0xfae60177), + TOBN(0xb80f031b, 0x4e12fede), TOBN(0xde55d0c2, 0x07a2186b)}, + {TOBN(0x1fb3e37f, 0x24dcdd5a), TOBN(0x8d602da5, 0x7ed191fb), + TOBN(0x108fb056, 0x76023e0d), TOBN(0x70178c71, 0x459c20c0)}}, + {{TOBN(0xfad5a386, 0x3fe54cf0), TOBN(0xa4a3ec4f, 0x02bbb475), + TOBN(0x1aa5ec20, 0x919d94d7), TOBN(0x5d3b63b5, 0xa81e4ab3)}, + {TOBN(0x7fa733d8, 0x5ad3d2af), TOBN(0xfbc586dd, 0xd1ac7a37), + TOBN(0x282925de, 0x40779614), TOBN(0xfe0ffffb, 0xe74a242a)}}, + {{TOBN(0x3f39e67f, 0x906151e5), TOBN(0xcea27f5f, 0x55e10649), + TOBN(0xdca1d4e1, 0xc17cf7b7), TOBN(0x0c326d12, 0x2fe2362d)}, + {TOBN(0x05f7ac33, 0x7dd35df3), TOBN(0x0c3b7639, 0xc396dbdf), + TOBN(0x0912f5ac, 0x03b7db1c), TOBN(0x9dea4b70, 0x5c9ed4a9)}}, + {{TOBN(0x475e6e53, 0xaae3f639), TOBN(0xfaba0e7c, 0xfc278bac), + TOBN(0x16f9e221, 0x9490375f), TOBN(0xaebf9746, 0xa5a7ed0a)}, + {TOBN(0x45f9af3f, 0xf41ad5d6), TOBN(0x03c4623c, 0xb2e99224), + TOBN(0x82c5bb5c, 0xb3cf56aa), TOBN(0x64311819, 0x34567ed3)}}, + {{TOBN(0xec57f211, 0x8be489ac), TOBN(0x2821895d, 0xb9a1104b), + TOBN(0x610dc875, 0x6064e007), TOBN(0x8e526f3f, 0x5b20d0fe)}, + {TOBN(0x6e71ca77, 0x5b645aee), TOBN(0x3d1dcb9f, 0x800e10ff), + TOBN(0x36b51162, 0x189cf6de), TOBN(0x2c5a3e30, 0x6bb17353)}}, + {{TOBN(0xc186cd3e, 0x2a6c6fbf), TOBN(0xa74516fa, 0x4bf97906), + TOBN(0x5b4b8f4b, 0x279d6901), TOBN(0x0c4e57b4, 0x2b573743)}, + {TOBN(0x75fdb229, 0xb6e386b6), TOBN(0xb46793fd, 0x99deac27), + TOBN(0xeeec47ea, 0xcf712629), TOBN(0xe965f3c4, 0xcbc3b2dd)}}, + {{TOBN(0x8dd1fb83, 0x425c6559), TOBN(0x7fc00ee6, 0x0af06fda), + TOBN(0xe98c9225, 0x33d956df), TOBN(0x0f1ef335, 0x4fbdc8a2)}, + {TOBN(0x2abb5145, 0xb79b8ea2), TOBN(0x40fd2945, 0xbdbff288), + TOBN(0x6a814ac4, 0xd7185db7), TOBN(0xc4329d6f, 0xc084609a)}}, + {{TOBN(0xc9ba7b52, 0xed1be45d), TOBN(0x891dd20d, 0xe4cd2c74), + TOBN(0x5a4d4a7f, 0x824139b1), TOBN(0x66c17716, 0xb873c710)}, + {TOBN(0x5e5bc141, 0x2843c4e0), TOBN(0xd5ac4817, 0xb97eb5bf), + TOBN(0xc0f8af54, 0x450c95c7), TOBN(0xc91b3fa0, 0x318406c5)}}, + {{TOBN(0x360c340a, 0xab9d97f8), TOBN(0xfb57bd07, 0x90a2d611), + TOBN(0x4339ae3c, 0xa6a6f7e5), TOBN(0x9c1fcd2a, 0x2feb8a10)}, + {TOBN(0x972bcca9, 0xc7ea7432), TOBN(0x1b0b924c, 0x308076f6), + TOBN(0x80b2814a, 0x2a5b4ca5), TOBN(0x2f78f55b, 0x61ef3b29)}}, + {{TOBN(0xf838744a, 0xc18a414f), TOBN(0xc611eaae, 0x903d0a86), + TOBN(0x94dabc16, 0x2a453f55), TOBN(0xe6f2e3da, 0x14efb279)}, + {TOBN(0x5b7a6017, 0x9320dc3c), TOBN(0x692e382f, 0x8df6b5a4), + TOBN(0x3f5e15e0, 0x2d40fa90), TOBN(0xc87883ae, 0x643dd318)}}, + {{TOBN(0x511053e4, 0x53544774), TOBN(0x834d0ecc, 0x3adba2bc), + TOBN(0x4215d7f7, 0xbae371f5), TOBN(0xfcfd57bf, 0x6c8663bc)}, + {TOBN(0xded2383d, 0xd6901b1d), TOBN(0x3b49fbb4, 0xb5587dc3), + TOBN(0xfd44a08d, 0x07625f62), TOBN(0x3ee4d65b, 0x9de9b762)}}}, + {{{TOBN(0x64e5137d, 0x0d63d1fa), TOBN(0x658fc052, 0x02a9d89f), + TOBN(0x48894874, 0x50436309), TOBN(0xe9ae30f8, 0xd598da61)}, + {TOBN(0x2ed710d1, 0x818baf91), TOBN(0xe27e9e06, 0x8b6a0c20), + TOBN(0x1e28dcfb, 0x1c1a6b44), TOBN(0x883acb64, 
0xd6ac57dc)}}, + {{TOBN(0x8735728d, 0xc2c6ff70), TOBN(0x79d6122f, 0xc5dc2235), + TOBN(0x23f5d003, 0x19e277f9), TOBN(0x7ee84e25, 0xdded8cc7)}, + {TOBN(0x91a8afb0, 0x63cd880a), TOBN(0x3f3ea7c6, 0x3574af60), + TOBN(0x0cfcdc84, 0x02de7f42), TOBN(0x62d0792f, 0xb31aa152)}}, + {{TOBN(0x8e1b4e43, 0x8a5807ce), TOBN(0xad283893, 0xe4109a7e), + TOBN(0xc30cc9cb, 0xafd59dda), TOBN(0xf65f36c6, 0x3d8d8093)}, + {TOBN(0xdf31469e, 0xa60d32b2), TOBN(0xee93df4b, 0x3e8191c8), + TOBN(0x9c1017c5, 0x355bdeb5), TOBN(0xd2623185, 0x8616aa28)}}, + {{TOBN(0xb02c83f9, 0xdec31a21), TOBN(0x988c8b23, 0x6ad9d573), + TOBN(0x53e983ae, 0xa57be365), TOBN(0xe968734d, 0x646f834e)}, + {TOBN(0x9137ea8f, 0x5da6309b), TOBN(0x10f3a624, 0xc1f1ce16), + TOBN(0x782a9ea2, 0xca440921), TOBN(0xdf94739e, 0x5b46f1b5)}}, + {{TOBN(0x9f9be006, 0xcce85c9b), TOBN(0x360e70d6, 0xa4c7c2d3), + TOBN(0x2cd5beea, 0xaefa1e60), TOBN(0x64cf63c0, 0x8c3d2b6d)}, + {TOBN(0xfb107fa3, 0xe1cf6f90), TOBN(0xb7e937c6, 0xd5e044e6), + TOBN(0x74e8ca78, 0xce34db9f), TOBN(0x4f8b36c1, 0x3e210bd0)}}, + {{TOBN(0x1df165a4, 0x34a35ea8), TOBN(0x3418e0f7, 0x4d4412f6), + TOBN(0x5af1f8af, 0x518836c3), TOBN(0x42ceef4d, 0x130e1965)}, + {TOBN(0x5560ca0b, 0x543a1957), TOBN(0xc33761e5, 0x886cb123), + TOBN(0x66624b1f, 0xfe98ed30), TOBN(0xf772f4bf, 0x1090997d)}}, + {{TOBN(0xf4e540bb, 0x4885d410), TOBN(0x7287f810, 0x9ba5f8d7), + TOBN(0x22d0d865, 0xde98dfb1), TOBN(0x49ff51a1, 0xbcfbb8a3)}, + {TOBN(0xb6b6fa53, 0x6bc3012e), TOBN(0x3d31fd72, 0x170d541d), + TOBN(0x8018724f, 0x4b0f4966), TOBN(0x79e7399f, 0x87dbde07)}}, + {{TOBN(0x56f8410e, 0xf4f8b16a), TOBN(0x97241afe, 0xc47b266a), + TOBN(0x0a406b8e, 0x6d9c87c1), TOBN(0x803f3e02, 0xcd42ab1b)}, + {TOBN(0x7f0309a8, 0x04dbec69), TOBN(0xa83b85f7, 0x3bbad05f), + TOBN(0xc6097273, 0xad8e197f), TOBN(0xc097440e, 0x5067adc1)}}, + {{TOBN(0x730eafb6, 0x3524ff16), TOBN(0xd7f9b51e, 0x823fc6ce), + TOBN(0x27bd0d32, 0x443e4ac0), TOBN(0x40c59ad9, 0x4d66f217)}, + {TOBN(0x6c33136f, 0x17c387a4), TOBN(0x5043b8d5, 0xeb86804d), + TOBN(0x74970312, 0x675a73c9), TOBN(0x838fdb31, 0xf16669b6)}}, + {{TOBN(0xc507b6dd, 0x418e7ddd), TOBN(0x39888d93, 0x472f19d6), + TOBN(0x7eae26be, 0x0c27eb4d), TOBN(0x17b53ed3, 0xfbabb884)}, + {TOBN(0xfc27021b, 0x2b01ae4f), TOBN(0x88462e87, 0xcf488682), + TOBN(0xbee096ec, 0x215e2d87), TOBN(0xeb2fea9a, 0xd242e29b)}}, + {{TOBN(0x5d985b5f, 0xb821fc28), TOBN(0x89d2e197, 0xdc1e2ad2), + TOBN(0x55b566b8, 0x9030ba62), TOBN(0xe3fd41b5, 0x4f41b1c6)}, + {TOBN(0xb738ac2e, 0xb9a96d61), TOBN(0x7f8567ca, 0x369443f4), + TOBN(0x8698622d, 0xf803a440), TOBN(0x2b586236, 0x8fe2f4dc)}}, + {{TOBN(0xbbcc00c7, 0x56b95bce), TOBN(0x5ec03906, 0x616da680), + TOBN(0x79162ee6, 0x72214252), TOBN(0x43132b63, 0x86a892d2)}, + {TOBN(0x4bdd3ff2, 0x2f3263bf), TOBN(0xd5b3733c, 0x9cd0a142), + TOBN(0x592eaa82, 0x44415ccb), TOBN(0x663e8924, 0x8d5474ea)}}, + {{TOBN(0x8058a25e, 0x5236344e), TOBN(0x82e8df9d, 0xbda76ee6), + TOBN(0xdcf6efd8, 0x11cc3d22), TOBN(0x00089cda, 0x3b4ab529)}, + {TOBN(0x91d3a071, 0xbd38a3db), TOBN(0x4ea97fc0, 0xef72b925), + TOBN(0x0c9fc15b, 0xea3edf75), TOBN(0x5a6297cd, 0xa4348ed3)}}, + {{TOBN(0x0d38ab35, 0xce7c42d4), TOBN(0x9fd493ef, 0x82feab10), + TOBN(0x46056b6d, 0x82111b45), TOBN(0xda11dae1, 0x73efc5c3)}, + {TOBN(0xdc740278, 0x5545a7fb), TOBN(0xbdb2601c, 0x40d507e6), + TOBN(0x121dfeeb, 0x7066fa58), TOBN(0x214369a8, 0x39ae8c2a)}}, + {{TOBN(0x195709cb, 0x06e0956c), TOBN(0x4c9d254f, 0x010cd34b), + TOBN(0xf51e13f7, 0x0471a532), TOBN(0xe19d6791, 0x1e73054d)}, + {TOBN(0xf702a628, 0xdb5c7be3), TOBN(0xc7141218, 0xb24dde05), + TOBN(0xdc18233c, 0xf29b2e2e), 
TOBN(0x3a6bd1e8, 0x85342dba)}}, + {{TOBN(0x3f747fa0, 0xb311898c), TOBN(0xe2a272e4, 0xcd0eac65), + TOBN(0x4bba5851, 0xf914d0bc), TOBN(0x7a1a9660, 0xc4a43ee3)}, + {TOBN(0xe5a367ce, 0xa1c8cde9), TOBN(0x9d958ba9, 0x7271abe3), + TOBN(0xf3ff7eb6, 0x3d1615cd), TOBN(0xa2280dce, 0xf5ae20b0)}}, + {{TOBN(0x56dba5c1, 0xcf640147), TOBN(0xea5a2e3d, 0x5e83d118), + TOBN(0x04cd6b6d, 0xda24c511), TOBN(0x1c0f4671, 0xe854d214)}, + {TOBN(0x91a6b7a9, 0x69565381), TOBN(0xdc966240, 0xdecf1f5b), + TOBN(0x1b22d21c, 0xfcf5d009), TOBN(0x2a05f641, 0x9021dbd5)}}, + {{TOBN(0x8c0ed566, 0xd4312483), TOBN(0x5179a95d, 0x643e216f), + TOBN(0xcc185fec, 0x17044493), TOBN(0xb3063339, 0x54991a21)}, + {TOBN(0xd801ecdb, 0x0081a726), TOBN(0x0149b0c6, 0x4fa89bbb), + TOBN(0xafe9065a, 0x4391b6b9), TOBN(0xedc92786, 0xd633f3a3)}}, + {{TOBN(0xe408c24a, 0xae6a8e13), TOBN(0x85833fde, 0x9f3897ab), + TOBN(0x43800e7e, 0xd81a0715), TOBN(0xde08e346, 0xb44ffc5f)}, + {TOBN(0x7094184c, 0xcdeff2e0), TOBN(0x49f9387b, 0x165eaed1), + TOBN(0x635d6129, 0x777c468a), TOBN(0x8c0dcfd1, 0x538c2dd8)}}, + {{TOBN(0xd6d9d9e3, 0x7a6a308b), TOBN(0x62375830, 0x4c2767d3), + TOBN(0x874a8bc6, 0xf38cbeb6), TOBN(0xd94d3f1a, 0xccb6fd9e)}, + {TOBN(0x92a9735b, 0xba21f248), TOBN(0x272ad0e5, 0x6cd1efb0), + TOBN(0x7437b69c, 0x05b03284), TOBN(0xe7f04702, 0x6948c225)}}, + {{TOBN(0x8a56c04a, 0xcba2ecec), TOBN(0x0c181270, 0xe3a73e41), + TOBN(0x6cb34e9d, 0x03e93725), TOBN(0xf77c8713, 0x496521a9)}, + {TOBN(0x94569183, 0xfa7f9f90), TOBN(0xf2e7aa4c, 0x8c9707ad), + TOBN(0xced2c9ba, 0x26c1c9a3), TOBN(0x9109fe96, 0x40197507)}}, + {{TOBN(0x9ae868a9, 0xe9adfe1c), TOBN(0x3984403d, 0x314e39bb), + TOBN(0xb5875720, 0xf2fe378f), TOBN(0x33f901e0, 0xba44a628)}, + {TOBN(0xea1125fe, 0x3652438c), TOBN(0xae9ec4e6, 0x9dd1f20b), + TOBN(0x1e740d9e, 0xbebf7fbd), TOBN(0x6dbd3ddc, 0x42dbe79c)}}, + {{TOBN(0x62082aec, 0xedd36776), TOBN(0xf612c478, 0xe9859039), + TOBN(0xa493b201, 0x032f7065), TOBN(0xebd4d8f2, 0x4ff9b211)}, + {TOBN(0x3f23a0aa, 0xaac4cb32), TOBN(0xea3aadb7, 0x15ed4005), + TOBN(0xacf17ea4, 0xafa27e63), TOBN(0x56125c1a, 0xc11fd66c)}}, + {{TOBN(0x266344a4, 0x3794f8dc), TOBN(0xdcca923a, 0x483c5c36), + TOBN(0x2d6b6bbf, 0x3f9d10a0), TOBN(0xb320c5ca, 0x81d9bdf3)}, + {TOBN(0x620e28ff, 0x47b50a95), TOBN(0x933e3b01, 0xcef03371), + TOBN(0xf081bf85, 0x99100153), TOBN(0x183be9a0, 0xc3a8c8d6)}}, + {{TOBN(0x4e3ddc5a, 0xd6bbe24d), TOBN(0xc6c74630, 0x53843795), + TOBN(0x78193dd7, 0x65ec2d4c), TOBN(0xb8df26cc, 0xcd3c89b2)}, + {TOBN(0x98dbe399, 0x5a483f8d), TOBN(0x72d8a957, 0x7dd3313a), + TOBN(0x65087294, 0xab0bd375), TOBN(0xfcd89248, 0x7c259d16)}}, + {{TOBN(0x8a9443d7, 0x7613aa81), TOBN(0x80100800, 0x85fe6584), + TOBN(0x70fc4dbc, 0x7fb10288), TOBN(0xf58280d3, 0xe86beee8)}, + {TOBN(0x14fdd82f, 0x7c978c38), TOBN(0xdf1204c1, 0x0de44d7b), + TOBN(0xa08a1c84, 0x4160252f), TOBN(0x591554ca, 0xc17646a5)}}, + {{TOBN(0x214a37d6, 0xa05bd525), TOBN(0x48d5f09b, 0x07957b3c), + TOBN(0x0247cdcb, 0xd7109bc9), TOBN(0x40f9e4bb, 0x30599ce7)}, + {TOBN(0xc325fa03, 0xf46ad2ec), TOBN(0x00f766cf, 0xc3e3f9ee), + TOBN(0xab556668, 0xd43a4577), TOBN(0x68d30a61, 0x3ee03b93)}}, + {{TOBN(0x7ddc81ea, 0x77b46a08), TOBN(0xcf5a6477, 0xc7480699), + TOBN(0x43a8cb34, 0x6633f683), TOBN(0x1b867e6b, 0x92363c60)}, + {TOBN(0x43921114, 0x1f60558e), TOBN(0xcdbcdd63, 0x2f41450e), + TOBN(0x7fc04601, 0xcc630e8b), TOBN(0xea7c66d5, 0x97038b43)}}, + {{TOBN(0x7259b8a5, 0x04e99fd8), TOBN(0x98a8dd12, 0x4785549a), + TOBN(0x0e459a7c, 0x840552e1), TOBN(0xcdfcf4d0, 0x4bb0909e)}, + {TOBN(0x34a86db2, 0x53758da7), TOBN(0xe643bb83, 0xeac997e1), + TOBN(0x96400bd7, 
0x530c5b7e), TOBN(0x9f97af87, 0xb41c8b52)}}, + {{TOBN(0x34fc8820, 0xfbeee3f9), TOBN(0x93e53490, 0x49091afd), + TOBN(0x764b9be5, 0x9a31f35c), TOBN(0x71f37864, 0x57e3d924)}, + {TOBN(0x02fb34e0, 0x943aa75e), TOBN(0xa18c9c58, 0xab8ff6e4), + TOBN(0x080f31b1, 0x33cf0d19), TOBN(0x5c9682db, 0x083518a7)}}, + {{TOBN(0x873d4ca6, 0xb709c3de), TOBN(0x64a84262, 0x3575b8f0), + TOBN(0x6275da1f, 0x020154bb), TOBN(0x97678caa, 0xd17cf1ab)}, + {TOBN(0x8779795f, 0x951a95c3), TOBN(0xdd35b163, 0x50fccc08), + TOBN(0x32709627, 0x33d8f031), TOBN(0x3c5ab10a, 0x498dd85c)}}, + {{TOBN(0xb6c185c3, 0x41dca566), TOBN(0x7de7feda, 0xd8622aa3), + TOBN(0x99e84d92, 0x901b6dfb), TOBN(0x30a02b0e, 0x7c4ad288)}, + {TOBN(0xc7c81daa, 0x2fd3cf36), TOBN(0xd1319547, 0xdf89e59f), + TOBN(0xb2be8184, 0xcd496733), TOBN(0xd5f449eb, 0x93d3412b)}}, + {{TOBN(0x7ea41b1b, 0x25fe531d), TOBN(0xf9797432, 0x6a1d5646), + TOBN(0x86067f72, 0x2bde501a), TOBN(0xf91481c0, 0x0c85e89c)}, + {TOBN(0xca8ee465, 0xf8b05bc6), TOBN(0x1844e1cf, 0x02e83cda), + TOBN(0xca82114a, 0xb4dbe33b), TOBN(0x0f9f8769, 0x4eabfde2)}}, + {{TOBN(0x4936b1c0, 0x38b27fe2), TOBN(0x63b6359b, 0xaba402df), + TOBN(0x40c0ea2f, 0x656bdbab), TOBN(0x9c992a89, 0x6580c39c)}, + {TOBN(0x600e8f15, 0x2a60aed1), TOBN(0xeb089ca4, 0xe0bf49df), + TOBN(0x9c233d7d, 0x2d42d99a), TOBN(0x648d3f95, 0x4c6bc2fa)}}, + {{TOBN(0xdcc383a8, 0xe1add3f3), TOBN(0xf42c0c6a, 0x4f64a348), + TOBN(0x2abd176f, 0x0030dbdb), TOBN(0x4de501a3, 0x7d6c215e)}, + {TOBN(0x4a107c1f, 0x4b9a64bc), TOBN(0xa77f0ad3, 0x2496cd59), + TOBN(0xfb78ac62, 0x7688dffb), TOBN(0x7025a2ca, 0x67937d8e)}}, + {{TOBN(0xfde8b2d1, 0xd1a8f4e7), TOBN(0xf5b3da47, 0x7354927c), + TOBN(0xe48606a3, 0xd9205735), TOBN(0xac477cc6, 0xe177b917)}, + {TOBN(0xfb1f73d2, 0xa883239a), TOBN(0xe12572f6, 0xcc8b8357), + TOBN(0x9d355e9c, 0xfb1f4f86), TOBN(0x89b795f8, 0xd9f3ec6e)}}, + {{TOBN(0x27be56f1, 0xb54398dc), TOBN(0x1890efd7, 0x3fedeed5), + TOBN(0x62f77f1f, 0x9c6d0140), TOBN(0x7ef0e314, 0x596f0ee4)}, + {TOBN(0x50ca6631, 0xcc61dab3), TOBN(0x4a39801d, 0xf4866e4f), + TOBN(0x66c8d032, 0xae363b39), TOBN(0x22c591e5, 0x2ead66aa)}}, + {{TOBN(0x954ba308, 0xde02a53e), TOBN(0x2a6c060f, 0xd389f357), + TOBN(0xe6cfcde8, 0xfbf40b66), TOBN(0x8e02fc56, 0xc6340ce1)}, + {TOBN(0xe4957795, 0x73adb4ba), TOBN(0x7b86122c, 0xa7b03805), + TOBN(0x63f83512, 0x0c8e6fa6), TOBN(0x83660ea0, 0x057d7804)}}, + {{TOBN(0xbad79105, 0x21ba473c), TOBN(0xb6c50bee, 0xded5389d), + TOBN(0xee2caf4d, 0xaa7c9bc0), TOBN(0xd97b8de4, 0x8c4e98a7)}, + {TOBN(0xa9f63e70, 0xab3bbddb), TOBN(0x3898aabf, 0x2597815a), + TOBN(0x7659af89, 0xac15b3d9), TOBN(0xedf7725b, 0x703ce784)}}, + {{TOBN(0x25470fab, 0xe085116b), TOBN(0x04a43375, 0x87285310), + TOBN(0x4e39187e, 0xe2bfd52f), TOBN(0x36166b44, 0x7d9ebc74)}, + {TOBN(0x92ad433c, 0xfd4b322c), TOBN(0x726aa817, 0xba79ab51), + TOBN(0xf96eacd8, 0xc1db15eb), TOBN(0xfaf71e91, 0x0476be63)}}, + {{TOBN(0xdd69a640, 0x641fad98), TOBN(0xb7995918, 0x29622559), + TOBN(0x03c6daa5, 0xde4199dc), TOBN(0x92cadc97, 0xad545eb4)}, + {TOBN(0x1028238b, 0x256534e4), TOBN(0x73e80ce6, 0x8595409a), + TOBN(0x690d4c66, 0xd05dc59b), TOBN(0xc95f7b8f, 0x981dee80)}}, + {{TOBN(0xf4337014, 0xd856ac25), TOBN(0x441bd9dd, 0xac524dca), + TOBN(0x640b3d85, 0x5f0499f5), TOBN(0x39cf84a9, 0xd5fda182)}, + {TOBN(0x04e7b055, 0xb2aa95a0), TOBN(0x29e33f0a, 0x0ddf1860), + TOBN(0x082e74b5, 0x423f6b43), TOBN(0x217edeb9, 0x0aaa2b0f)}}, + {{TOBN(0x58b83f35, 0x83cbea55), TOBN(0xc485ee4d, 0xbc185d70), + TOBN(0x833ff03b, 0x1e5f6992), TOBN(0xb5b9b9cc, 0xcf0c0dd5)}, + {TOBN(0x7caaee8e, 0x4e9e8a50), TOBN(0x462e907b, 0x6269dafd), + 
TOBN(0x6ed5cee9, 0xfbe791c6), TOBN(0x68ca3259, 0xed430790)}}, + {{TOBN(0x2b72bdf2, 0x13b5ba88), TOBN(0x60294c8a, 0x35ef0ac4), + TOBN(0x9c3230ed, 0x19b99b08), TOBN(0x560fff17, 0x6c2589aa)}, + {TOBN(0x552b8487, 0xd6770374), TOBN(0xa373202d, 0x9a56f685), + TOBN(0xd3e7f907, 0x45f175d9), TOBN(0x3c2f315f, 0xd080d810)}}, + {{TOBN(0x1130e9dd, 0x7b9520e8), TOBN(0xc078f9e2, 0x0af037b5), + TOBN(0x38cd2ec7, 0x1e9c104c), TOBN(0x0f684368, 0xc472fe92)}, + {TOBN(0xd3f1b5ed, 0x6247e7ef), TOBN(0xb32d33a9, 0x396dfe21), + TOBN(0x46f59cf4, 0x4a9aa2c2), TOBN(0x69cd5168, 0xff0f7e41)}}, + {{TOBN(0x3f59da0f, 0x4b3234da), TOBN(0xcf0b0235, 0xb4579ebe), + TOBN(0x6d1cbb25, 0x6d2476c7), TOBN(0x4f0837e6, 0x9dc30f08)}, + {TOBN(0x9a4075bb, 0x906f6e98), TOBN(0x253bb434, 0xc761e7d1), + TOBN(0xde2e645f, 0x6e73af10), TOBN(0xb89a4060, 0x0c5f131c)}}, + {{TOBN(0xd12840c5, 0xb8cc037f), TOBN(0x3d093a5b, 0x7405bb47), + TOBN(0x6202c253, 0x206348b8), TOBN(0xbf5d57fc, 0xc55a3ca7)}, + {TOBN(0x89f6c90c, 0x8c3bef48), TOBN(0x23ac7623, 0x5a0a960a), + TOBN(0xdfbd3d6b, 0x552b42ab), TOBN(0x3ef22458, 0x132061f6)}}, + {{TOBN(0xd74e9bda, 0xc97e6516), TOBN(0x88779360, 0xc230f49e), + TOBN(0xa6ec1de3, 0x1e74ea49), TOBN(0x581dcee5, 0x3fb645a2)}, + {TOBN(0xbaef2391, 0x8f483f14), TOBN(0x6d2dddfc, 0xd137d13b), + TOBN(0x54cde50e, 0xd2743a42), TOBN(0x89a34fc5, 0xe4d97e67)}}, + {{TOBN(0x13f1f5b3, 0x12e08ce5), TOBN(0xa80540b8, 0xa7f0b2ca), + TOBN(0x854bcf77, 0x01982805), TOBN(0xb8653ffd, 0x233bea04)}, + {TOBN(0x8e7b8787, 0x02b0b4c9), TOBN(0x2675261f, 0x9acb170a), + TOBN(0x061a9d90, 0x930c14e5), TOBN(0xb59b30e0, 0xdef0abea)}}, + {{TOBN(0x1dc19ea6, 0x0200ec7d), TOBN(0xb6f4a3f9, 0x0bce132b), + TOBN(0xb8d5de90, 0xf13e27e0), TOBN(0xbaee5ef0, 0x1fade16f)}, + {TOBN(0x6f406aaa, 0xe4c6cf38), TOBN(0xab4cfe06, 0xd1369815), + TOBN(0x0dcffe87, 0xefd550c6), TOBN(0x9d4f59c7, 0x75ff7d39)}}, + {{TOBN(0xb02553b1, 0x51deb6ad), TOBN(0x812399a4, 0xb1877749), + TOBN(0xce90f71f, 0xca6006e1), TOBN(0xc32363a6, 0xb02b6e77)}, + {TOBN(0x02284fbe, 0xdc36c64d), TOBN(0x86c81e31, 0xa7e1ae61), + TOBN(0x2576c7e5, 0xb909d94a), TOBN(0x8b6f7d02, 0x818b2bb0)}}, + {{TOBN(0xeca3ed07, 0x56faa38a), TOBN(0xa3790e6c, 0x9305bb54), + TOBN(0xd784eeda, 0x7bc73061), TOBN(0xbd56d369, 0x6dd50614)}, + {TOBN(0xd6575949, 0x229a8aa9), TOBN(0xdcca8f47, 0x4595ec28), + TOBN(0x814305c1, 0x06ab4fe6), TOBN(0xc8c39768, 0x24f43f16)}}, + {{TOBN(0xe2a45f36, 0x523f2b36), TOBN(0x995c6493, 0x920d93bb), + TOBN(0xf8afdab7, 0x90f1632b), TOBN(0x79ebbecd, 0x1c295954)}, + {TOBN(0xc7bb3ddb, 0x79592f48), TOBN(0x67216a7b, 0x5f88e998), + TOBN(0xd91f098b, 0xbc01193e), TOBN(0xf7d928a5, 0xb1db83fc)}}, + {{TOBN(0x55e38417, 0xe991f600), TOBN(0x2a91113e, 0x2981a934), + TOBN(0xcbc9d648, 0x06b13bde), TOBN(0xb011b6ac, 0x0755ff44)}, + {TOBN(0x6f4cb518, 0x045ec613), TOBN(0x522d2d31, 0xc2f5930a), + TOBN(0x5acae1af, 0x382e65de), TOBN(0x57643067, 0x27bc966f)}}, + {{TOBN(0x5e12705d, 0x1c7193f0), TOBN(0xf0f32f47, 0x3be8858e), + TOBN(0x785c3d7d, 0x96c6dfc7), TOBN(0xd75b4a20, 0xbf31795d)}, + {TOBN(0x91acf17b, 0x342659d4), TOBN(0xe596ea34, 0x44f0378f), + TOBN(0x4515708f, 0xce52129d), TOBN(0x17387e1e, 0x79f2f585)}}, + {{TOBN(0x72cfd2e9, 0x49dee168), TOBN(0x1ae05223, 0x3e2af239), + TOBN(0x009e75be, 0x1d94066a), TOBN(0x6cca31c7, 0x38abf413)}, + {TOBN(0xb50bd61d, 0x9bc49908), TOBN(0x4a9b4a8c, 0xf5e2bc1e), + TOBN(0xeb6cc5f7, 0x946f83ac), TOBN(0x27da93fc, 0xebffab28)}}, + {{TOBN(0xea314c96, 0x4821c8c5), TOBN(0x8de49ded, 0xa83c15f4), + TOBN(0x7a64cf20, 0x7af33004), TOBN(0x45f1bfeb, 0xc9627e10)}, + {TOBN(0x878b0626, 0x54b9df60), TOBN(0x5e4fdc3c, 0xa95c0b33), 
+ TOBN(0xe54a37ca, 0xc2035d8e), TOBN(0x9087cda9, 0x80f20b8c)}}, + {{TOBN(0x36f61c23, 0x8319ade4), TOBN(0x766f287a, 0xde8cfdf8), + TOBN(0x48821948, 0x346f3705), TOBN(0x49a7b853, 0x16e4f4a2)}, + {TOBN(0xb9b3f8a7, 0x5cedadfd), TOBN(0x8f562815, 0x8db2a815), + TOBN(0xc0b7d554, 0x01f68f95), TOBN(0x12971e27, 0x688a208e)}}, + {{TOBN(0xc9f8b696, 0xd0ff34fc), TOBN(0x20824de2, 0x1222718c), + TOBN(0x7213cf9f, 0x0c95284d), TOBN(0xe2ad741b, 0xdc158240)}, + {TOBN(0x0ee3a6df, 0x54043ccf), TOBN(0x16ff479b, 0xd84412b3), + TOBN(0xf6c74ee0, 0xdfc98af0), TOBN(0xa78a169f, 0x52fcd2fb)}}, + {{TOBN(0xd8ae8746, 0x99c930e9), TOBN(0x1d33e858, 0x49e117a5), + TOBN(0x7581fcb4, 0x6624759f), TOBN(0xde50644f, 0x5bedc01d)}, + {TOBN(0xbeec5d00, 0xcaf3155e), TOBN(0x672d66ac, 0xbc73e75f), + TOBN(0x86b9d8c6, 0x270b01db), TOBN(0xd249ef83, 0x50f55b79)}}, + {{TOBN(0x6131d6d4, 0x73978fe3), TOBN(0xcc4e4542, 0x754b00a1), + TOBN(0x4e05df05, 0x57dfcfe9), TOBN(0x94b29cdd, 0x51ef6bf0)}, + {TOBN(0xe4530cff, 0x9bc7edf2), TOBN(0x8ac236fd, 0xd3da65f3), + TOBN(0x0faf7d5f, 0xc8eb0b48), TOBN(0x4d2de14c, 0x660eb039)}}, + {{TOBN(0xc006bba7, 0x60430e54), TOBN(0x10a2d0d6, 0xda3289ab), + TOBN(0x9c037a5d, 0xd7979c59), TOBN(0x04d1f3d3, 0xa116d944)}, + {TOBN(0x9ff22473, 0x8a0983cd), TOBN(0x28e25b38, 0xc883cabb), + TOBN(0xe968dba5, 0x47a58995), TOBN(0x2c80b505, 0x774eebdf)}}, + {{TOBN(0xee763b71, 0x4a953beb), TOBN(0x502e223f, 0x1642e7f6), + TOBN(0x6fe4b641, 0x61d5e722), TOBN(0x9d37c5b0, 0xdbef5316)}, + {TOBN(0x0115ed70, 0xf8330bc7), TOBN(0x139850e6, 0x75a72789), + TOBN(0x27d7faec, 0xffceccc2), TOBN(0x3016a860, 0x4fd9f7f6)}}, + {{TOBN(0xc492ec64, 0x4cd8f64c), TOBN(0x58a2d790, 0x279d7b51), + TOBN(0x0ced1fc5, 0x1fc75256), TOBN(0x3e658aed, 0x8f433017)}, + {TOBN(0x0b61942e, 0x05da59eb), TOBN(0xba3d60a3, 0x0ddc3722), + TOBN(0x7c311cd1, 0x742e7f87), TOBN(0x6473ffee, 0xf6b01b6e)}}}, + {{{TOBN(0x8303604f, 0x692ac542), TOBN(0xf079ffe1, 0x227b91d3), + TOBN(0x19f63e63, 0x15aaf9bd), TOBN(0xf99ee565, 0xf1f344fb)}, + {TOBN(0x8a1d661f, 0xd6219199), TOBN(0x8c883bc6, 0xd48ce41c), + TOBN(0x1065118f, 0x3c74d904), TOBN(0x713889ee, 0x0faf8b1b)}}, + {{TOBN(0x972b3f8f, 0x81a1b3be), TOBN(0x4f3ce145, 0xce2764a0), + TOBN(0xe2d0f1cc, 0x28c4f5f7), TOBN(0xdeee0c0d, 0xc7f3985b)}, + {TOBN(0x7df4adc0, 0xd39e25c3), TOBN(0x40619820, 0xc467a080), + TOBN(0x440ebc93, 0x61cf5a58), TOBN(0x527729a6, 0x422ad600)}}, + {{TOBN(0xca6c0937, 0xb1b76ba6), TOBN(0x1a2eab85, 0x4d2026dc), + TOBN(0xb1715e15, 0x19d9ae0a), TOBN(0xf1ad9199, 0xbac4a026)}, + {TOBN(0x35b3dfb8, 0x07ea7b0e), TOBN(0xedf5496f, 0x3ed9eb89), + TOBN(0x8932e5ff, 0x2d6d08ab), TOBN(0xf314874e, 0x25bd2731)}}, + {{TOBN(0xefb26a75, 0x3f73f449), TOBN(0x1d1c94f8, 0x8d44fc79), + TOBN(0x49f0fbc5, 0x3bc0dc4d), TOBN(0xb747ea0b, 0x3698a0d0)}, + {TOBN(0x5218c3fe, 0x228d291e), TOBN(0x35b804b5, 0x43c129d6), + TOBN(0xfac859b8, 0xd1acc516), TOBN(0x6c10697d, 0x95d6e668)}}, + {{TOBN(0xc38e438f, 0x0876fd4e), TOBN(0x45f0c307, 0x83d2f383), + TOBN(0x203cc2ec, 0xb10934cb), TOBN(0x6a8f2439, 0x2c9d46ee)}, + {TOBN(0xf16b431b, 0x65ccde7b), TOBN(0x41e2cd18, 0x27e76a6f), + TOBN(0xb9c8cf8f, 0x4e3484d7), TOBN(0x64426efd, 0x8315244a)}}, + {{TOBN(0x1c0a8e44, 0xfc94dea3), TOBN(0x34c8cdbf, 0xdad6a0b0), + TOBN(0x919c3840, 0x04113cef), TOBN(0xfd32fba4, 0x15490ffa)}, + {TOBN(0x58d190f6, 0x795dcfb7), TOBN(0xfef01b03, 0x83588baf), + TOBN(0x9e6d1d63, 0xca1fc1c0), TOBN(0x53173f96, 0xf0a41ac9)}}, + {{TOBN(0x2b1d402a, 0xba16f73b), TOBN(0x2fb31014, 0x8cf9b9fc), + TOBN(0x2d51e60e, 0x446ef7bf), TOBN(0xc731021b, 0xb91e1745)}, + {TOBN(0x9d3b4724, 0x4fee99d4), TOBN(0x4bca48b6, 
0xfac5c1ea), + TOBN(0x70f5f514, 0xbbea9af7), TOBN(0x751f55a5, 0x974c283a)}}, + {{TOBN(0x6e30251a, 0xcb452fdb), TOBN(0x31ee6965, 0x50f30650), + TOBN(0xb0b3e508, 0x933548d9), TOBN(0xb8949a4f, 0xf4b0ef5b)}, + {TOBN(0x208b8326, 0x3c88f3bd), TOBN(0xab147c30, 0xdb1d9989), + TOBN(0xed6515fd, 0x44d4df03), TOBN(0x17a12f75, 0xe72eb0c5)}}, + {{TOBN(0x3b59796d, 0x36cf69db), TOBN(0x1219eee9, 0x56670c18), + TOBN(0xfe3341f7, 0x7a070d8e), TOBN(0x9b70130b, 0xa327f90c)}, + {TOBN(0x36a32462, 0x0ae18e0e), TOBN(0x2021a623, 0x46c0a638), + TOBN(0x251b5817, 0xc62eb0d4), TOBN(0x87bfbcdf, 0x4c762293)}}, + {{TOBN(0xf78ab505, 0xcdd61d64), TOBN(0x8c7a53fc, 0xc8c18857), + TOBN(0xa653ce6f, 0x16147515), TOBN(0x9c923aa5, 0xea7d52d5)}, + {TOBN(0xc24709cb, 0x5c18871f), TOBN(0x7d53bec8, 0x73b3cc74), + TOBN(0x59264aff, 0xfdd1d4c4), TOBN(0x5555917e, 0x240da582)}}, + {{TOBN(0xcae8bbda, 0x548f5a0e), TOBN(0x1910eaba, 0x3bbfbbe1), + TOBN(0xae579685, 0x7677afc3), TOBN(0x49ea61f1, 0x73ff0b5c)}, + {TOBN(0x78655478, 0x4f7c3922), TOBN(0x95d337cd, 0x20c68eef), + TOBN(0x68f1e1e5, 0xdf779ab9), TOBN(0x14b491b0, 0xb5cf69a8)}}, + {{TOBN(0x7a6cbbe0, 0x28e3fe89), TOBN(0xe7e1fee4, 0xc5aac0eb), + TOBN(0x7f47eda5, 0x697e5140), TOBN(0x4f450137, 0xb454921f)}, + {TOBN(0xdb625f84, 0x95cd8185), TOBN(0x74be0ba1, 0xcdb2e583), + TOBN(0xaee4fd7c, 0xdd5e6de4), TOBN(0x4251437d, 0xe8101739)}}, + {{TOBN(0x686d72a0, 0xac620366), TOBN(0x4be3fb9c, 0xb6d59344), + TOBN(0x6e8b44e7, 0xa1eb75b9), TOBN(0x84e39da3, 0x91a5c10c)}, + {TOBN(0x37cc1490, 0xb38f0409), TOBN(0x02951943, 0x2c2ade82), + TOBN(0x9b688783, 0x1190a2d8), TOBN(0x25627d14, 0x231182ba)}}, + {{TOBN(0x6eb550aa, 0x658a6d87), TOBN(0x1405aaa7, 0xcf9c7325), + TOBN(0xd147142e, 0x5c8748c9), TOBN(0x7f637e4f, 0x53ede0e0)}, + {TOBN(0xf8ca2776, 0x14ffad2c), TOBN(0xe58fb1bd, 0xbafb6791), + TOBN(0x17158c23, 0xbf8f93fc), TOBN(0x7f15b373, 0x0a4a4655)}}, + {{TOBN(0x39d4add2, 0xd842ca72), TOBN(0xa71e4391, 0x3ed96305), + TOBN(0x5bb09cbe, 0x6700be14), TOBN(0x68d69d54, 0xd8befcf6)}, + {TOBN(0xa45f5367, 0x37183bcf), TOBN(0x7152b7bb, 0x3370dff7), + TOBN(0xcf887baa, 0xbf12525b), TOBN(0xe7ac7bdd, 0xd6d1e3cd)}}, + {{TOBN(0x25914f78, 0x81fdad90), TOBN(0xcf638f56, 0x0d2cf6ab), + TOBN(0xb90bc03f, 0xcc054de5), TOBN(0x932811a7, 0x18b06350)}, + {TOBN(0x2f00b330, 0x9bbd11ff), TOBN(0x76108a6f, 0xb4044974), + TOBN(0x801bb9e0, 0xa851d266), TOBN(0x0dd099be, 0xbf8990c1)}}, + {{TOBN(0x58c5aaaa, 0xabe32986), TOBN(0x0fe9dd2a, 0x50d59c27), + TOBN(0x84951ff4, 0x8d307305), TOBN(0x6c23f829, 0x86529b78)}, + {TOBN(0x50bb2218, 0x0b136a79), TOBN(0x7e2174de, 0x77a20996), + TOBN(0x6f00a4b9, 0xc0bb4da6), TOBN(0x89a25a17, 0xefdde8da)}}, + {{TOBN(0xf728a27e, 0xc11ee01d), TOBN(0xf900553a, 0xe5f10dfb), + TOBN(0x189a83c8, 0x02ec893c), TOBN(0x3ca5bdc1, 0x23f66d77)}, + {TOBN(0x98781537, 0x97eada9f), TOBN(0x59c50ab3, 0x10256230), + TOBN(0x346042d9, 0x323c69b3), TOBN(0x1b715a6d, 0x2c460449)}}, + {{TOBN(0xa41dd476, 0x6ae06e0b), TOBN(0xcdd7888e, 0x9d42e25f), + TOBN(0x0f395f74, 0x56b25a20), TOBN(0xeadfe0ae, 0x8700e27e)}, + {TOBN(0xb09d52a9, 0x69950093), TOBN(0x3525d9cb, 0x327f8d40), + TOBN(0xb8235a94, 0x67df886a), TOBN(0x77e4b0dd, 0x035faec2)}}, + {{TOBN(0x115eb20a, 0x517d7061), TOBN(0x77fe3433, 0x6c2df683), + TOBN(0x6870ddc7, 0xcdc6fc67), TOBN(0xb1610588, 0x0b87de83)}, + {TOBN(0x343584ca, 0xd9c4ddbe), TOBN(0xb3164f1c, 0x3d754be2), + TOBN(0x0731ed3a, 0xc1e6c894), TOBN(0x26327dec, 0x4f6b904c)}}, + {{TOBN(0x9d49c6de, 0x97b5cd32), TOBN(0x40835dae, 0xb5eceecd), + TOBN(0xc66350ed, 0xd9ded7fe), TOBN(0x8aeebb5c, 0x7a678804)}, + {TOBN(0x51d42fb7, 0x5b8ee9ec), 
TOBN(0xd7a17bdd, 0x8e3ca118), + TOBN(0x40d7511a, 0x2ef4400e), TOBN(0xc48990ac, 0x875a66f4)}}, + {{TOBN(0x8de07d2a, 0x2199e347), TOBN(0xbee75556, 0x2a39e051), + TOBN(0x56918786, 0x916e51dc), TOBN(0xeb191313, 0x4a2d89ec)}, + {TOBN(0x6679610d, 0x37d341ed), TOBN(0x434fbb41, 0x56d51c2b), + TOBN(0xe54b7ee7, 0xd7492dba), TOBN(0xaa33a79a, 0x59021493)}}, + {{TOBN(0x49fc5054, 0xe4bd6d3d), TOBN(0x09540f04, 0x5ab551d0), + TOBN(0x8acc9085, 0x4942d3a6), TOBN(0x231af02f, 0x2d28323b)}, + {TOBN(0x93458cac, 0x0992c163), TOBN(0x1fef8e71, 0x888e3bb4), + TOBN(0x27578da5, 0xbe8c268c), TOBN(0xcc8be792, 0xe805ec00)}}, + {{TOBN(0x29267bae, 0xc61c3855), TOBN(0xebff429d, 0x58c1fd3b), + TOBN(0x22d886c0, 0x8c0b93b8), TOBN(0xca5e00b2, 0x2ddb8953)}, + {TOBN(0xcf330117, 0xc3fed8b7), TOBN(0xd49ac6fa, 0x819c01f6), + TOBN(0x6ddaa6bd, 0x3c0fbd54), TOBN(0x91743068, 0x8049a2cf)}}, + {{TOBN(0xd67f981e, 0xaff2ef81), TOBN(0xc3654d35, 0x2818ae80), + TOBN(0x81d05044, 0x1b2aa892), TOBN(0x2db067bf, 0x3d099328)}, + {TOBN(0xe7c79e86, 0x703dcc97), TOBN(0xe66f9b37, 0xe133e215), + TOBN(0xcdf119a6, 0xe39a7a5c), TOBN(0x47c60de3, 0x876f1b61)}}, + {{TOBN(0x6e405939, 0xd860f1b2), TOBN(0x3e9a1dbc, 0xf5ed4d4a), + TOBN(0x3f23619e, 0xc9b6bcbd), TOBN(0x5ee790cf, 0x734e4497)}, + {TOBN(0xf0a834b1, 0x5bdaf9bb), TOBN(0x02cedda7, 0x4ca295f0), + TOBN(0x4619aa2b, 0xcb8e378c), TOBN(0xe5613244, 0xcc987ea4)}}, + {{TOBN(0x0bc022cc, 0x76b23a50), TOBN(0x4a2793ad, 0x0a6c21ce), + TOBN(0x38328780, 0x89cac3f5), TOBN(0x29176f1b, 0xcba26d56)}, + {TOBN(0x06296187, 0x4f6f59eb), TOBN(0x86e9bca9, 0x8bdc658e), + TOBN(0x2ca9c4d3, 0x57e30402), TOBN(0x5438b216, 0x516a09bb)}}, + {{TOBN(0x0a6a063c, 0x7672765a), TOBN(0x37a3ce64, 0x0547b9bf), + TOBN(0x42c099c8, 0x98b1a633), TOBN(0xb5ab800d, 0x05ee6961)}, + {TOBN(0xf1963f59, 0x11a5acd6), TOBN(0xbaee6157, 0x46201063), + TOBN(0x36d9a649, 0xa596210a), TOBN(0xaed04363, 0x1ba7138c)}}, + {{TOBN(0xcf817d1c, 0xa4a82b76), TOBN(0x5586960e, 0xf3806be9), + TOBN(0x7ab67c89, 0x09dc6bb5), TOBN(0x52ace7a0, 0x114fe7eb)}, + {TOBN(0xcd987618, 0xcbbc9b70), TOBN(0x4f06fd5a, 0x604ca5e1), + TOBN(0x90af14ca, 0x6dbde133), TOBN(0x1afe4322, 0x948a3264)}}, + {{TOBN(0xa70d2ca6, 0xc44b2c6c), TOBN(0xab726799, 0x0ef87dfe), + TOBN(0x310f64dc, 0x2e696377), TOBN(0x49b42e68, 0x4c8126a0)}, + {TOBN(0x0ea444c3, 0xcea0b176), TOBN(0x53a8ddf7, 0xcb269182), + TOBN(0xf3e674eb, 0xbbba9dcb), TOBN(0x0d2878a8, 0xd8669d33)}}, + {{TOBN(0x04b935d5, 0xd019b6a3), TOBN(0xbb5cf88e, 0x406f1e46), + TOBN(0xa1912d16, 0x5b57c111), TOBN(0x9803fc21, 0x19ebfd78)}, + {TOBN(0x4f231c9e, 0xc07764a9), TOBN(0xd93286ee, 0xb75bd055), + TOBN(0x83a9457d, 0x8ee6c9de), TOBN(0x04695915, 0x6087ec90)}}, + {{TOBN(0x14c6dd8a, 0x58d6cd46), TOBN(0x9cb633b5, 0x8e6634d2), + TOBN(0xc1305047, 0xf81bc328), TOBN(0x12ede0e2, 0x26a177e5)}, + {TOBN(0x332cca62, 0x065a6f4f), TOBN(0xc3a47ecd, 0x67be487b), + TOBN(0x741eb187, 0x0f47ed1c), TOBN(0x99e66e58, 0xe7598b14)}}, + {{TOBN(0x6f0544ca, 0x63d0ff12), TOBN(0xe5efc784, 0xb610a05f), + TOBN(0xf72917b1, 0x7cad7b47), TOBN(0x3ff6ea20, 0xf2cac0c0)}, + {TOBN(0xcc23791b, 0xf21db8b7), TOBN(0x7dac70b1, 0xd7d93565), + TOBN(0x682cda1d, 0x694bdaad), TOBN(0xeb88bb8c, 0x1023516d)}}, + {{TOBN(0xc4c634b4, 0xdfdbeb1b), TOBN(0x22f5ca72, 0xb4ee4dea), + TOBN(0x1045a368, 0xe6524821), TOBN(0xed9e8a3f, 0x052b18b2)}, + {TOBN(0x9b7f2cb1, 0xb961f49a), TOBN(0x7fee2ec1, 0x7b009670), + TOBN(0x350d8754, 0x22507a6d), TOBN(0x561bd711, 0x4db55f1d)}}, + {{TOBN(0x4c189ccc, 0x320bbcaf), TOBN(0x568434cf, 0xdf1de48c), + TOBN(0x6af1b00e, 0x0fa8f128), TOBN(0xf0ba9d02, 0x8907583c)}, + {TOBN(0x735a4004, 
0x32ff9f60), TOBN(0x3dd8e4b6, 0xc25dcf33), + TOBN(0xf2230f16, 0x42c74cef), TOBN(0xd8117623, 0x013fa8ad)}}, + {{TOBN(0x36822876, 0xf51fe76e), TOBN(0x8a6811cc, 0x11d62589), + TOBN(0xc3fc7e65, 0x46225718), TOBN(0xb7df2c9f, 0xc82fdbcd)}, + {TOBN(0x3b1d4e52, 0xdd7b205b), TOBN(0xb6959478, 0x47a2e414), + TOBN(0x05e4d793, 0xefa91148), TOBN(0xb47ed446, 0xfd2e9675)}}, + {{TOBN(0x1a7098b9, 0x04c9d9bf), TOBN(0x661e2881, 0x1b793048), + TOBN(0xb1a16966, 0xb01ee461), TOBN(0xbc521308, 0x2954746f)}, + {TOBN(0xc909a0fc, 0x2477de50), TOBN(0xd80bb41c, 0x7dbd51ef), + TOBN(0xa85be7ec, 0x53294905), TOBN(0x6d465b18, 0x83958f97)}}, + {{TOBN(0x16f6f330, 0xfb6840fd), TOBN(0xfaaeb214, 0x3401e6c8), + TOBN(0xaf83d30f, 0xccb5b4f8), TOBN(0x22885739, 0x266dec4b)}, + {TOBN(0x51b4367c, 0x7bc467df), TOBN(0x926562e3, 0xd842d27a), + TOBN(0xdfcb6614, 0x0fea14a6), TOBN(0xeb394dae, 0xf2734cd9)}}, + {{TOBN(0x3eeae5d2, 0x11c0be98), TOBN(0xb1e6ed11, 0x814e8165), + TOBN(0x191086bc, 0xe52bce1c), TOBN(0x14b74cc6, 0xa75a04da)}, + {TOBN(0x63cf1186, 0x8c060985), TOBN(0x071047de, 0x2dbd7f7c), + TOBN(0x4e433b8b, 0xce0942ca), TOBN(0xecbac447, 0xd8fec61d)}}, + {{TOBN(0x8f0ed0e2, 0xebf3232f), TOBN(0xfff80f9e, 0xc52a2edd), + TOBN(0xad9ab433, 0x75b55fdb), TOBN(0x73ca7820, 0xe42e0c11)}, + {TOBN(0x6dace0a0, 0xe6251b46), TOBN(0x89bc6b5c, 0x4c0d932d), + TOBN(0x3438cd77, 0x095da19a), TOBN(0x2f24a939, 0x8d48bdfb)}}, + {{TOBN(0x99b47e46, 0x766561b7), TOBN(0x736600e6, 0x0ed0322a), + TOBN(0x06a47cb1, 0x638e1865), TOBN(0x927c1c2d, 0xcb136000)}, + {TOBN(0x29542337, 0x0cc5df69), TOBN(0x99b37c02, 0x09d649a9), + TOBN(0xc5f0043c, 0x6aefdb27), TOBN(0x6cdd9987, 0x1be95c27)}}, + {{TOBN(0x69850931, 0x390420d2), TOBN(0x299c40ac, 0x0983efa4), + TOBN(0x3a05e778, 0xaf39aead), TOBN(0x84274408, 0x43a45193)}, + {TOBN(0x6bcd0fb9, 0x91a711a0), TOBN(0x461592c8, 0x9f52ab17), + TOBN(0xb49302b4, 0xda3c6ed6), TOBN(0xc51fddc7, 0x330d7067)}}, + {{TOBN(0x94babeb6, 0xda50d531), TOBN(0x521b840d, 0xa6a7b9da), + TOBN(0x5305151e, 0x404bdc89), TOBN(0x1bcde201, 0xd0d07449)}, + {TOBN(0xf427a78b, 0x3b76a59a), TOBN(0xf84841ce, 0x07791a1b), + TOBN(0xebd314be, 0xbf91ed1c), TOBN(0x8e61d34c, 0xbf172943)}}, + {{TOBN(0x1d5dc451, 0x5541b892), TOBN(0xb186ee41, 0xfc9d9e54), + TOBN(0x9d9f345e, 0xd5bf610d), TOBN(0x3e7ba65d, 0xf6acca9f)}, + {TOBN(0x9dda787a, 0xa8369486), TOBN(0x09f9dab7, 0x8eb5ba53), + TOBN(0x5afb2033, 0xd6481bc3), TOBN(0x76f4ce30, 0xafa62104)}}, + {{TOBN(0xa8fa00cf, 0xf4f066b5), TOBN(0x89ab5143, 0x461dafc2), + TOBN(0x44339ed7, 0xa3389998), TOBN(0x2ff862f1, 0xbc214903)}, + {TOBN(0x2c88f985, 0xb05556e3), TOBN(0xcd96058e, 0x3467081e), + TOBN(0x7d6a4176, 0xedc637ea), TOBN(0xe1743d09, 0x36a5acdc)}}, + {{TOBN(0x66fd72e2, 0x7eb37726), TOBN(0xf7fa264e, 0x1481a037), + TOBN(0x9fbd3bde, 0x45f4aa79), TOBN(0xed1e0147, 0x767c3e22)}, + {TOBN(0x7621f979, 0x82e7abe2), TOBN(0x19eedc72, 0x45f633f8), + TOBN(0xe69b155e, 0x6137bf3a), TOBN(0xa0ad13ce, 0x414ee94e)}}, + {{TOBN(0x93e3d524, 0x1c0e651a), TOBN(0xab1a6e2a, 0x02ce227e), + TOBN(0xe7af1797, 0x4ab27eca), TOBN(0x245446de, 0xbd444f39)}, + {TOBN(0x59e22a21, 0x56c07613), TOBN(0x43deafce, 0xf4275498), + TOBN(0x10834ccb, 0x67fd0946), TOBN(0xa75841e5, 0x47406edf)}}, + {{TOBN(0xebd6a677, 0x7b0ac93d), TOBN(0xa6e37b0d, 0x78f5e0d7), + TOBN(0x2516c096, 0x76f5492b), TOBN(0x1e4bf888, 0x9ac05f3a)}, + {TOBN(0xcdb42ce0, 0x4df0ba2b), TOBN(0x935d5cfd, 0x5062341b), + TOBN(0x8a303333, 0x82acac20), TOBN(0x429438c4, 0x5198b00e)}}, + {{TOBN(0x1d083bc9, 0x049d33fa), TOBN(0x58b82dda, 0x946f67ff), + TOBN(0xac3e2db8, 0x67a1d6a3), TOBN(0x62e6bead, 0x1798aac8)}, + 
{TOBN(0xfc85980f, 0xde46c58c), TOBN(0xa7f69379, 0x69c8d7be), + TOBN(0x23557927, 0x837b35ec), TOBN(0x06a933d8, 0xe0790c0c)}}, + {{TOBN(0x827c0e9b, 0x077ff55d), TOBN(0x53977798, 0xbb26e680), + TOBN(0x59530874, 0x1d9cb54f), TOBN(0xcca3f449, 0x4aac53ef)}, + {TOBN(0x11dc5c87, 0xa07eda0f), TOBN(0xc138bccf, 0xfd6400c8), + TOBN(0x549680d3, 0x13e5da72), TOBN(0xc93eed82, 0x4540617e)}}, + {{TOBN(0xfd3db157, 0x4d0b75c0), TOBN(0x9716eb42, 0x6386075b), + TOBN(0x0639605c, 0x817b2c16), TOBN(0x09915109, 0xf1e4f201)}, + {TOBN(0x35c9a928, 0x5cca6c3b), TOBN(0xb25f7d1a, 0x3505c900), + TOBN(0xeb9f7d20, 0x630480c4), TOBN(0xc3c7b8c6, 0x2a1a501c)}}, + {{TOBN(0x3f99183c, 0x5a1f8e24), TOBN(0xfdb118fa, 0x9dd255f0), + TOBN(0xb9b18b90, 0xc27f62a6), TOBN(0xe8f732f7, 0x396ec191)}, + {TOBN(0x524a2d91, 0x0be786ab), TOBN(0x5d32adef, 0x0ac5a0f5), + TOBN(0x9b53d4d6, 0x9725f694), TOBN(0x032a76c6, 0x0510ba89)}}, + {{TOBN(0x840391a3, 0xebeb1544), TOBN(0x44b7b88c, 0x3ed73ac3), + TOBN(0xd24bae7a, 0x256cb8b3), TOBN(0x7ceb151a, 0xe394cb12)}, + {TOBN(0xbd6b66d0, 0x5bc1e6a8), TOBN(0xec70cecb, 0x090f07bf), + TOBN(0x270644ed, 0x7d937589), TOBN(0xee9e1a3d, 0x5f1dccfe)}}, + {{TOBN(0xb0d40a84, 0x745b98d2), TOBN(0xda429a21, 0x2556ed40), + TOBN(0xf676eced, 0x85148cb9), TOBN(0x5a22d40c, 0xded18936)}, + {TOBN(0x3bc4b9e5, 0x70e8a4ce), TOBN(0xbfd1445b, 0x9eae0379), + TOBN(0xf23f2c0c, 0x1a0bd47e), TOBN(0xa9c0bb31, 0xe1845531)}}, + {{TOBN(0x9ddc4d60, 0x0a4c3f6b), TOBN(0xbdfaad79, 0x2c15ef44), + TOBN(0xce55a236, 0x7f484acc), TOBN(0x08653ca7, 0x055b1f15)}, + {TOBN(0x2efa8724, 0x538873a3), TOBN(0x09299e5d, 0xace1c7e7), + TOBN(0x07afab66, 0xade332ba), TOBN(0x9be1fdf6, 0x92dd71b7)}}, + {{TOBN(0xa49b5d59, 0x5758b11c), TOBN(0x0b852893, 0xc8654f40), + TOBN(0xb63ef6f4, 0x52379447), TOBN(0xd4957d29, 0x105e690c)}, + {TOBN(0x7d484363, 0x646559b0), TOBN(0xf4a8273c, 0x49788a8e), + TOBN(0xee406cb8, 0x34ce54a9), TOBN(0x1e1c260f, 0xf86fda9b)}}, + {{TOBN(0xe150e228, 0xcf6a4a81), TOBN(0x1fa3b6a3, 0x1b488772), + TOBN(0x1e6ff110, 0xc5a9c15b), TOBN(0xc6133b91, 0x8ad6aa47)}, + {TOBN(0x8ac5d55c, 0x9dffa978), TOBN(0xba1d1c1d, 0x5f3965f2), + TOBN(0xf969f4e0, 0x7732b52f), TOBN(0xfceecdb5, 0xa5172a07)}}, + {{TOBN(0xb0120a5f, 0x10f2b8f5), TOBN(0xc83a6cdf, 0x5c4c2f63), + TOBN(0x4d47a491, 0xf8f9c213), TOBN(0xd9e1cce5, 0xd3f1bbd5)}, + {TOBN(0x0d91bc7c, 0xaba7e372), TOBN(0xfcdc74c8, 0xdfd1a2db), + TOBN(0x05efa800, 0x374618e5), TOBN(0x11216969, 0x15a7925e)}}, + {{TOBN(0xd4c89823, 0xf6021c5d), TOBN(0x880d5e84, 0xeff14423), + TOBN(0x6523bc5a, 0x6dcd1396), TOBN(0xd1acfdfc, 0x113c978b)}, + {TOBN(0xb0c164e8, 0xbbb66840), TOBN(0xf7f4301e, 0x72b58459), + TOBN(0xc29ad4a6, 0xa638e8ec), TOBN(0xf5ab8961, 0x46b78699)}}, + {{TOBN(0x9dbd7974, 0x0e954750), TOBN(0x0121de88, 0x64f9d2c6), + TOBN(0x2e597b42, 0xd985232e), TOBN(0x55b6c3c5, 0x53451777)}, + {TOBN(0xbb53e547, 0x519cb9fb), TOBN(0xf134019f, 0x8428600d), + TOBN(0x5a473176, 0xe081791a), TOBN(0x2f3e2263, 0x35fb0c08)}}, + {{TOBN(0xb28c3017, 0x73d273b0), TOBN(0xccd21076, 0x7721ef9a), + TOBN(0x054cc292, 0xb650dc39), TOBN(0x662246de, 0x6188045e)}, + {TOBN(0x904b52fa, 0x6b83c0d1), TOBN(0xa72df267, 0x97e9cd46), + TOBN(0x886b43cd, 0x899725e4), TOBN(0x2b651688, 0xd849ff22)}}, + {{TOBN(0x60479b79, 0x02f34533), TOBN(0x5e354c14, 0x0c77c148), + TOBN(0xb4bb7581, 0xa8537c78), TOBN(0x188043d7, 0xefe1495f)}, + {TOBN(0x9ba12f42, 0x8c1d5026), TOBN(0x2e0c8a26, 0x93d4aaab), + TOBN(0xbdba7b8b, 0xaa57c450), TOBN(0x140c9ad6, 0x9bbdafef)}}, + {{TOBN(0x2067aa42, 0x25ac0f18), TOBN(0xf7b1295b, 0x04d1fbf3), + TOBN(0x14829111, 0xa4b04824), TOBN(0x2ce3f192, 0x33bd5e91)}, 
+ {TOBN(0x9c7a1d55, 0x8f2e1b72), TOBN(0xfe932286, 0x302aa243), + TOBN(0x497ca7b4, 0xd4be9554), TOBN(0xb8e821b8, 0xe0547a6e)}}, + {{TOBN(0xfb2838be, 0x67e573e0), TOBN(0x05891db9, 0x4084c44b), + TOBN(0x91311373, 0x96c1c2c5), TOBN(0x6aebfa3f, 0xd958444b)}, + {TOBN(0xac9cdce9, 0xe56e55c1), TOBN(0x7148ced3, 0x2caa46d0), + TOBN(0x2e10c7ef, 0xb61fe8eb), TOBN(0x9fd835da, 0xff97cf4d)}}}, + {{{TOBN(0xa36da109, 0x081e9387), TOBN(0xfb9780d7, 0x8c935828), + TOBN(0xd5940332, 0xe540b015), TOBN(0xc9d7b51b, 0xe0f466fa)}, + {TOBN(0xfaadcd41, 0xd6d9f671), TOBN(0xba6c1e28, 0xb1a2ac17), + TOBN(0x066a7833, 0xed201e5f), TOBN(0x19d99719, 0xf90f462b)}}, + {{TOBN(0xf431f462, 0x060b5f61), TOBN(0xa56f46b4, 0x7bd057c2), + TOBN(0x348dca6c, 0x47e1bf65), TOBN(0x9a38783e, 0x41bcf1ff)}, + {TOBN(0x7a5d33a9, 0xda710718), TOBN(0x5a779987, 0x2e0aeaf6), + TOBN(0xca87314d, 0x2d29d187), TOBN(0xfa0edc3e, 0xc687d733)}}, + {{TOBN(0x9df33621, 0x6a31e09b), TOBN(0xde89e44d, 0xc1350e35), + TOBN(0x29214871, 0x4ca0cf52), TOBN(0xdf379672, 0x0b88a538)}, + {TOBN(0xc92a510a, 0x2591d61b), TOBN(0x79aa87d7, 0x585b447b), + TOBN(0xf67db604, 0xe5287f77), TOBN(0x1697c8bf, 0x5efe7a80)}}, + {{TOBN(0x1c894849, 0xcb198ac7), TOBN(0xa884a93d, 0x0f264665), + TOBN(0x2da964ef, 0x9b200678), TOBN(0x3c351b87, 0x009834e6)}, + {TOBN(0xafb2ef9f, 0xe2c4b44b), TOBN(0x580f6c47, 0x3326790c), + TOBN(0xb8480521, 0x0b02264a), TOBN(0x8ba6f9e2, 0x42a194e2)}}, + {{TOBN(0xfc87975f, 0x8fb54738), TOBN(0x35160788, 0x27c3ead3), + TOBN(0x834116d2, 0xb74a085a), TOBN(0x53c99a73, 0xa62fe996)}, + {TOBN(0x87585be0, 0x5b81c51b), TOBN(0x925bafa8, 0xbe0852b7), + TOBN(0x76a4fafd, 0xa84d19a7), TOBN(0x39a45982, 0x585206d4)}}, + {{TOBN(0x499b6ab6, 0x5eb03c0e), TOBN(0xf19b7954, 0x72bc3fde), + TOBN(0xa86b5b9c, 0x6e3a80d2), TOBN(0xe4377508, 0x6d42819f)}, + {TOBN(0xc1663650, 0xbb3ee8a3), TOBN(0x75eb14fc, 0xb132075f), + TOBN(0xa8ccc906, 0x7ad834f6), TOBN(0xea6a2474, 0xe6e92ffd)}}, + {{TOBN(0x9d72fd95, 0x0f8d6758), TOBN(0xcb84e101, 0x408c07dd), + TOBN(0xb9114bfd, 0xa5e23221), TOBN(0x358b5fe2, 0xe94e742c)}, + {TOBN(0x1c0577ec, 0x95f40e75), TOBN(0xf0155451, 0x3d73f3d6), + TOBN(0x9d55cd67, 0xbd1b9b66), TOBN(0x63e86e78, 0xaf8d63c7)}}, + {{TOBN(0x39d934ab, 0xd3c095f1), TOBN(0x04b261be, 0xe4b76d71), + TOBN(0x1d2e6970, 0xe73e6984), TOBN(0x879fb23b, 0x5e5fcb11)}, + {TOBN(0x11506c72, 0xdfd75490), TOBN(0x3a97d085, 0x61bcf1c1), + TOBN(0x43201d82, 0xbf5e7007), TOBN(0x7f0ac52f, 0x798232a7)}}, + {{TOBN(0x2715cbc4, 0x6eb564d4), TOBN(0x8d6c752c, 0x9e570e29), + TOBN(0xf80247c8, 0x9ef5fd5d), TOBN(0xc3c66b46, 0xd53eb514)}, + {TOBN(0x9666b401, 0x0f87de56), TOBN(0xce62c06f, 0xc6c603b5), + TOBN(0xae7b4c60, 0x7e4fc942), TOBN(0x38ac0b77, 0x663a9c19)}}, + {{TOBN(0xcb4d20ee, 0x4b049136), TOBN(0x8b63bf12, 0x356a4613), + TOBN(0x1221aef6, 0x70e08128), TOBN(0xe62d8c51, 0x4acb6b16)}, + {TOBN(0x71f64a67, 0x379e7896), TOBN(0xb25237a2, 0xcafd7fa5), + TOBN(0xf077bd98, 0x3841ba6a), TOBN(0xc4ac0244, 0x3cd16e7e)}}, + {{TOBN(0x548ba869, 0x21fea4ca), TOBN(0xd36d0817, 0xf3dfdac1), + TOBN(0x09d8d71f, 0xf4685faf), TOBN(0x8eff66be, 0xc52c459a)}, + {TOBN(0x182faee7, 0x0b57235e), TOBN(0xee3c39b1, 0x0106712b), + TOBN(0x5107331f, 0xc0fcdcb0), TOBN(0x669fb9dc, 0xa51054ba)}}, + {{TOBN(0xb25101fb, 0x319d7682), TOBN(0xb0293129, 0x0a982fee), + TOBN(0x51c1c9b9, 0x0261b344), TOBN(0x0e008c5b, 0xbfd371fa)}, + {TOBN(0xd866dd1c, 0x0278ca33), TOBN(0x666f76a6, 0xe5aa53b1), + TOBN(0xe5cfb779, 0x6013a2cf), TOBN(0x1d3a1aad, 0xa3521836)}}, + {{TOBN(0xcedd2531, 0x73faa485), TOBN(0xc8ee6c4f, 0xc0a76878), + TOBN(0xddbccfc9, 0x2a11667d), TOBN(0x1a418ea9, 
0x1c2f695a)}, + {TOBN(0xdb11bd92, 0x51f73971), TOBN(0x3e4b3c82, 0xda2ed89f), + TOBN(0x9a44f3f4, 0xe73e0319), TOBN(0xd1e3de0f, 0x303431af)}}, + {{TOBN(0x3c5604ff, 0x50f75f9c), TOBN(0x1d8eddf3, 0x7e752b22), + TOBN(0x0ef074dd, 0x3c9a1118), TOBN(0xd0ffc172, 0xccb86d7b)}, + {TOBN(0xabd1ece3, 0x037d90f2), TOBN(0xe3f307d6, 0x6055856c), + TOBN(0x422f9328, 0x7e4c6daf), TOBN(0x902aac66, 0x334879a0)}}, + {{TOBN(0xb6a1e7bf, 0x94cdfade), TOBN(0x6c97e1ed, 0x7fc6d634), + TOBN(0x662ad24d, 0xa2fb63f8), TOBN(0xf81be1b9, 0xa5928405)}, + {TOBN(0x86d765e4, 0xd14b4206), TOBN(0xbecc2e0e, 0x8fa0db65), + TOBN(0xa28838e0, 0xb17fc76c), TOBN(0xe49a602a, 0xe37cf24e)}}, + {{TOBN(0x76b4131a, 0x567193ec), TOBN(0xaf3c305a, 0xe5f6e70b), + TOBN(0x9587bd39, 0x031eebdd), TOBN(0x5709def8, 0x71bbe831)}, + {TOBN(0x57059983, 0x0eb2b669), TOBN(0x4d80ce1b, 0x875b7029), + TOBN(0x838a7da8, 0x0364ac16), TOBN(0x2f431d23, 0xbe1c83ab)}}, + {{TOBN(0xe56812a6, 0xf9294dd3), TOBN(0xb448d01f, 0x9b4b0d77), + TOBN(0xf3ae6061, 0x04e8305c), TOBN(0x2bead645, 0x94d8c63e)}, + {TOBN(0x0a85434d, 0x84fd8b07), TOBN(0x537b983f, 0xf7a9dee5), + TOBN(0xedcc5f18, 0xef55bd85), TOBN(0x2041af62, 0x21c6cf8b)}}, + {{TOBN(0x8e52874c, 0xb940c71e), TOBN(0x211935a9, 0xdb5f4b3a), + TOBN(0x94350492, 0x301b1dc3), TOBN(0x33d2646d, 0x29958620)}, + {TOBN(0x16b0d64b, 0xef911404), TOBN(0x9d1f25ea, 0x9a3c5ef4), + TOBN(0x20f200eb, 0x4a352c78), TOBN(0x43929f2c, 0x4bd0b428)}}, + {{TOBN(0xa5656667, 0xc7196e29), TOBN(0x7992c2f0, 0x9391be48), + TOBN(0xaaa97cbd, 0x9ee0cd6e), TOBN(0x51b0310c, 0x3dc8c9bf)}, + {TOBN(0x237f8acf, 0xdd9f22cb), TOBN(0xbb1d81a1, 0xb585d584), + TOBN(0x8d5d85f5, 0x8c416388), TOBN(0x0d6e5a5a, 0x42fe474f)}}, + {{TOBN(0xe7812766, 0x38235d4e), TOBN(0x1c62bd67, 0x496e3298), + TOBN(0x8378660c, 0x3f175bc8), TOBN(0x4d04e189, 0x17afdd4d)}, + {TOBN(0x32a81601, 0x85a8068c), TOBN(0xdb58e4e1, 0x92b29a85), + TOBN(0xe8a65b86, 0xc70d8a3b), TOBN(0x5f0e6f4e, 0x98a0403b)}}, + {{TOBN(0x08129684, 0x69ed2370), TOBN(0x34dc30bd, 0x0871ee26), + TOBN(0x3a5ce948, 0x7c9c5b05), TOBN(0x7d487b80, 0x43a90c87)}, + {TOBN(0x4089ba37, 0xdd0e7179), TOBN(0x45f80191, 0xb4041811), + TOBN(0x1c3e1058, 0x98747ba5), TOBN(0x98c4e13a, 0x6e1ae592)}}, + {{TOBN(0xd44636e6, 0xe82c9f9e), TOBN(0x711db87c, 0xc33a1043), + TOBN(0x6f431263, 0xaa8aec05), TOBN(0x43ff120d, 0x2744a4aa)}, + {TOBN(0xd3bd892f, 0xae77779b), TOBN(0xf0fe0cc9, 0x8cdc9f82), + TOBN(0xca5f7fe6, 0xf1c5b1bc), TOBN(0xcc63a682, 0x44929a72)}}, + {{TOBN(0xc7eaba0c, 0x09dbe19a), TOBN(0x2f3585ad, 0x6b5c73c2), + TOBN(0x8ab8924b, 0x0ae50c30), TOBN(0x17fcd27a, 0x638b30ba)}, + {TOBN(0xaf414d34, 0x10b3d5a5), TOBN(0x09c107d2, 0x2a9accf1), + TOBN(0x15dac49f, 0x946a6242), TOBN(0xaec3df2a, 0xd707d642)}}, + {{TOBN(0x2c2492b7, 0x3f894ae0), TOBN(0xf59df3e5, 0xb75f18ce), + TOBN(0x7cb740d2, 0x8f53cad0), TOBN(0x3eb585fb, 0xc4f01294)}, + {TOBN(0x17da0c86, 0x32c7f717), TOBN(0xeb8c795b, 0xaf943f4c), + TOBN(0x4ee23fb5, 0xf67c51d2), TOBN(0xef187575, 0x68889949)}}, + {{TOBN(0xa6b4bdb2, 0x0389168b), TOBN(0xc4ecd258, 0xea577d03), + TOBN(0x3a63782b, 0x55743082), TOBN(0x6f678f4c, 0xc72f08cd)}, + {TOBN(0x553511cf, 0x65e58dd8), TOBN(0xd53b4e3e, 0xd402c0cd), + TOBN(0x37de3e29, 0xa037c14c), TOBN(0x86b6c516, 0xc05712aa)}}, + {{TOBN(0x2834da3e, 0xb38dff6f), TOBN(0xbe012c52, 0xea636be8), + TOBN(0x292d238c, 0x61dd37f8), TOBN(0x0e54523f, 0x8f8142db)}, + {TOBN(0xe31eb436, 0x036a05d8), TOBN(0x83e3cdff, 0x1e93c0ff), + TOBN(0x3fd2fe0f, 0x50821ddf), TOBN(0xc8e19b0d, 0xff9eb33b)}}, + {{TOBN(0xc8cc943f, 0xb569a5fe), TOBN(0xad0090d4, 0xd4342d75), + TOBN(0x82090b4b, 0xcaeca000), 
TOBN(0xca39687f, 0x1bd410eb)}, + {TOBN(0xe7bb0df7, 0x65959d77), TOBN(0x39d78218, 0x9c964999), + TOBN(0xd87f62e8, 0xb2415451), TOBN(0xe5efb774, 0xbed76108)}}, + {{TOBN(0x3ea011a4, 0xe822f0d0), TOBN(0xbc647ad1, 0x5a8704f8), + TOBN(0xbb315b35, 0x50c6820f), TOBN(0x863dec3d, 0xb7e76bec)}, + {TOBN(0x01ff5d3a, 0xf017bfc7), TOBN(0x20054439, 0x976b8229), + TOBN(0x067fca37, 0x0bbd0d3b), TOBN(0xf63dde64, 0x7f5e3d0f)}}, + {{TOBN(0x22dbefb3, 0x2a4c94e9), TOBN(0xafbff0fe, 0x96f8278a), + TOBN(0x80aea0b1, 0x3503793d), TOBN(0xb2238029, 0x5f06cd29)}, + {TOBN(0x65703e57, 0x8ec3feca), TOBN(0x06c38314, 0x393e7053), + TOBN(0xa0b751eb, 0x7c6734c4), TOBN(0xd2e8a435, 0xc59f0f1e)}}, + {{TOBN(0x147d9052, 0x5e9ca895), TOBN(0x2f4dd31e, 0x972072df), + TOBN(0xa16fda8e, 0xe6c6755c), TOBN(0xc66826ff, 0xcf196558)}, + {TOBN(0x1f1a76a3, 0x0cf43895), TOBN(0xa9d604e0, 0x83c3097b), + TOBN(0xe1908309, 0x66390e0e), TOBN(0xa50bf753, 0xb3c85eff)}}, + {{TOBN(0x0696bdde, 0xf6a70251), TOBN(0x548b801b, 0x3c6ab16a), + TOBN(0x37fcf704, 0xa4d08762), TOBN(0x090b3def, 0xdff76c4e)}, + {TOBN(0x87e8cb89, 0x69cb9158), TOBN(0x44a90744, 0x995ece43), + TOBN(0xf85395f4, 0x0ad9fbf5), TOBN(0x49b0f6c5, 0x4fb0c82d)}}, + {{TOBN(0x75d9bc15, 0xadf7cccf), TOBN(0x81a3e5d6, 0xdfa1e1b0), + TOBN(0x8c39e444, 0x249bc17e), TOBN(0xf37dccb2, 0x8ea7fd43)}, + {TOBN(0xda654873, 0x907fba12), TOBN(0x35daa6da, 0x4a372904), + TOBN(0x0564cfc6, 0x6283a6c5), TOBN(0xd09fa4f6, 0x4a9395bf)}}, + {{TOBN(0x688e9ec9, 0xaeb19a36), TOBN(0xd913f1ce, 0xc7bfbfb4), + TOBN(0x797b9a3c, 0x61c2faa6), TOBN(0x2f979bec, 0x6a0a9c12)}, + {TOBN(0xb5969d0f, 0x359679ec), TOBN(0xebcf523d, 0x079b0460), + TOBN(0xfd6b0008, 0x10fab870), TOBN(0x3f2edcda, 0x9373a39c)}}, + {{TOBN(0x0d64f9a7, 0x6f568431), TOBN(0xf848c27c, 0x02f8898c), + TOBN(0xf418ade1, 0x260b5bd5), TOBN(0xc1f3e323, 0x6973dee8)}, + {TOBN(0x46e9319c, 0x26c185dd), TOBN(0x6d85b7d8, 0x546f0ac4), + TOBN(0x427965f2, 0x247f9d57), TOBN(0xb519b636, 0xb0035f48)}}, + {{TOBN(0x6b6163a9, 0xab87d59c), TOBN(0xff9f58c3, 0x39caaa11), + TOBN(0x4ac39cde, 0x3177387b), TOBN(0x5f6557c2, 0x873e77f9)}, + {TOBN(0x67504006, 0x36a83041), TOBN(0x9b1c96ca, 0x75ef196c), + TOBN(0xf34283de, 0xb08c7940), TOBN(0x7ea09644, 0x1128c316)}}, + {{TOBN(0xb510b3b5, 0x6aa39dff), TOBN(0x59b43da2, 0x9f8e4d8c), + TOBN(0xa8ce31fd, 0x9e4c4b9f), TOBN(0x0e20be26, 0xc1303c01)}, + {TOBN(0x18187182, 0xe8ee47c9), TOBN(0xd9687cdb, 0x7db98101), + TOBN(0x7a520e4d, 0xa1e14ff6), TOBN(0x429808ba, 0x8836d572)}}, + {{TOBN(0xa37ca60d, 0x4944b663), TOBN(0xf901f7a9, 0xa3f91ae5), + TOBN(0xe4e3e76e, 0x9e36e3b1), TOBN(0x9aa219cf, 0x29d93250)}, + {TOBN(0x347fe275, 0x056a2512), TOBN(0xa4d643d9, 0xde65d95c), + TOBN(0x9669d396, 0x699fc3ed), TOBN(0xb598dee2, 0xcf8c6bbe)}}, + {{TOBN(0x682ac1e5, 0xdda9e5c6), TOBN(0x4e0d3c72, 0xcaa9fc95), + TOBN(0x17faaade, 0x772bea44), TOBN(0x5ef8428c, 0xab0009c8)}, + {TOBN(0xcc4ce47a, 0x460ff016), TOBN(0xda6d12bf, 0x725281cb), + TOBN(0x44c67848, 0x0223aad2), TOBN(0x6e342afa, 0x36256e28)}}, + {{TOBN(0x1400bb0b, 0x93a37c04), TOBN(0x62b1bc9b, 0xdd10bd96), + TOBN(0x7251adeb, 0x0dac46b7), TOBN(0x7d33b92e, 0x7be4ef51)}, + {TOBN(0x28b2a94b, 0xe61fa29a), TOBN(0x4b2be13f, 0x06422233), + TOBN(0x36d6d062, 0x330d8d37), TOBN(0x5ef80e1e, 0xb28ca005)}}, + {{TOBN(0x174d4699, 0x6d16768e), TOBN(0x9fc4ff6a, 0x628bf217), + TOBN(0x77705a94, 0x154e490d), TOBN(0x9d96dd28, 0x8d2d997a)}, + {TOBN(0x77e2d9d8, 0xce5d72c4), TOBN(0x9d06c5a4, 0xc11c714f), + TOBN(0x02aa5136, 0x79e4a03e), TOBN(0x1386b3c2, 0x030ff28b)}}, + {{TOBN(0xfe82e8a6, 0xfb283f61), TOBN(0x7df203e5, 0xf3abc3fb), + TOBN(0xeec7c351, 
0x3a4d3622), TOBN(0xf7d17dbf, 0xdf762761)}, + {TOBN(0xc3956e44, 0x522055f0), TOBN(0xde3012db, 0x8fa748db), + TOBN(0xca9fcb63, 0xbf1dcc14), TOBN(0xa56d9dcf, 0xbe4e2f3a)}}, + {{TOBN(0xb86186b6, 0x8bcec9c2), TOBN(0x7cf24df9, 0x680b9f06), + TOBN(0xc46b45ea, 0xc0d29281), TOBN(0xfff42bc5, 0x07b10e12)}, + {TOBN(0x12263c40, 0x4d289427), TOBN(0x3d5f1899, 0xb4848ec4), + TOBN(0x11f97010, 0xd040800c), TOBN(0xb4c5f529, 0x300feb20)}}, + {{TOBN(0xcc543f8f, 0xde94fdcb), TOBN(0xe96af739, 0xc7c2f05e), + TOBN(0xaa5e0036, 0x882692e1), TOBN(0x09c75b68, 0x950d4ae9)}, + {TOBN(0x62f63df2, 0xb5932a7a), TOBN(0x2658252e, 0xde0979ad), + TOBN(0x2a19343f, 0xb5e69631), TOBN(0x718c7501, 0x525b666b)}}, + {{TOBN(0x26a42d69, 0xea40dc3a), TOBN(0xdc84ad22, 0xaecc018f), + TOBN(0x25c36c7b, 0x3270f04a), TOBN(0x46ba6d47, 0x50fa72ed)}, + {TOBN(0x6c37d1c5, 0x93e58a8e), TOBN(0xa2394731, 0x120c088c), + TOBN(0xc3be4263, 0xcb6e86da), TOBN(0x2c417d36, 0x7126d038)}}, + {{TOBN(0x5b70f9c5, 0x8b6f8efa), TOBN(0x671a2faa, 0x37718536), + TOBN(0xd3ced3c6, 0xb539c92b), TOBN(0xe56f1bd9, 0xa31203c2)}, + {TOBN(0x8b096ec4, 0x9ff3c8eb), TOBN(0x2deae432, 0x43491cea), + TOBN(0x2465c6eb, 0x17943794), TOBN(0x5d267e66, 0x20586843)}}, + {{TOBN(0x9d3d116d, 0xb07159d0), TOBN(0xae07a67f, 0xc1896210), + TOBN(0x8fc84d87, 0xbb961579), TOBN(0x30009e49, 0x1c1f8dd6)}, + {TOBN(0x8a8caf22, 0xe3132819), TOBN(0xcffa197c, 0xf23ab4ff), + TOBN(0x58103a44, 0x205dd687), TOBN(0x57b796c3, 0x0ded67a2)}}, + {{TOBN(0x0b9c3a6c, 0xa1779ad7), TOBN(0xa33cfe2e, 0x357c09c5), + TOBN(0x2ea29315, 0x3db4a57e), TOBN(0x91959695, 0x8ebeb52e)}, + {TOBN(0x118db9a6, 0xe546c879), TOBN(0x8e996df4, 0x6295c8d6), + TOBN(0xdd990484, 0x55ec806b), TOBN(0x24f291ca, 0x165c1035)}}, + {{TOBN(0xcca523bb, 0x440e2229), TOBN(0x324673a2, 0x73ef4d04), + TOBN(0xaf3adf34, 0x3e11ec39), TOBN(0x6136d7f1, 0xdc5968d3)}, + {TOBN(0x7a7b2899, 0xb053a927), TOBN(0x3eaa2661, 0xae067ecd), + TOBN(0x8549b9c8, 0x02779cd9), TOBN(0x061d7940, 0xc53385ea)}}, + {{TOBN(0x3e0ba883, 0xf06d18bd), TOBN(0x4ba6de53, 0xb2700843), + TOBN(0xb966b668, 0x591a9e4d), TOBN(0x93f67567, 0x7f4fa0ed)}, + {TOBN(0x5a02711b, 0x4347237b), TOBN(0xbc041e2f, 0xe794608e), + TOBN(0x55af10f5, 0x70f73d8c), TOBN(0xd2d4d4f7, 0xbb7564f7)}}, + {{TOBN(0xd7d27a89, 0xb3e93ce7), TOBN(0xf7b5a875, 0x5d3a2c1b), + TOBN(0xb29e68a0, 0x255b218a), TOBN(0xb533837e, 0x8af76754)}, + {TOBN(0xd1b05a73, 0x579fab2e), TOBN(0xb41055a1, 0xecd74385), + TOBN(0xb2369274, 0x445e9115), TOBN(0x2972a7c4, 0xf520274e)}}, + {{TOBN(0x6c08334e, 0xf678e68a), TOBN(0x4e4160f0, 0x99b057ed), + TOBN(0x3cfe11b8, 0x52ccb69a), TOBN(0x2fd1823a, 0x21c8f772)}, + {TOBN(0xdf7f072f, 0x3298f055), TOBN(0x8c0566f9, 0xfec74a6e), + TOBN(0xe549e019, 0x5bb4d041), TOBN(0x7c3930ba, 0x9208d850)}}, + {{TOBN(0xe07141fc, 0xaaa2902b), TOBN(0x539ad799, 0xe4f69ad3), + TOBN(0xa6453f94, 0x813f9ffd), TOBN(0xc58d3c48, 0x375bc2f7)}, + {TOBN(0xb3326fad, 0x5dc64e96), TOBN(0x3aafcaa9, 0xb240e354), + TOBN(0x1d1b0903, 0xaca1e7a9), TOBN(0x4ceb9767, 0x1211b8a0)}}, + {{TOBN(0xeca83e49, 0xe32a858e), TOBN(0x4c32892e, 0xae907bad), + TOBN(0xd5b42ab6, 0x2eb9b494), TOBN(0x7fde3ee2, 0x1eabae1b)}, + {TOBN(0x13b5ab09, 0xcaf54957), TOBN(0xbfb028be, 0xe5f5d5d5), + TOBN(0x928a0650, 0x2003e2c0), TOBN(0x90793aac, 0x67476843)}}, + {{TOBN(0x5e942e79, 0xc81710a0), TOBN(0x557e4a36, 0x27ccadd4), + TOBN(0x72a2bc56, 0x4bcf6d0c), TOBN(0x09ee5f43, 0x26d7b80c)}, + {TOBN(0x6b70dbe9, 0xd4292f19), TOBN(0x56f74c26, 0x63f16b18), + TOBN(0xc23db0f7, 0x35fbb42a), TOBN(0xb606bdf6, 0x6ae10040)}}, + {{TOBN(0x1eb15d4d, 0x044573ac), TOBN(0x7dc3cf86, 0x556b0ba4), + 
TOBN(0x97af9a33, 0xc60df6f7), TOBN(0x0b1ef85c, 0xa716ce8c)}, + {TOBN(0x2922f884, 0xc96958be), TOBN(0x7c32fa94, 0x35690963), + TOBN(0x2d7f667c, 0xeaa00061), TOBN(0xeaaf7c17, 0x3547365c)}}, + {{TOBN(0x1eb4de46, 0x87032d58), TOBN(0xc54f3d83, 0x5e2c79e0), + TOBN(0x07818df4, 0x5d04ef23), TOBN(0x55faa9c8, 0x673d41b4)}, + {TOBN(0xced64f6f, 0x89b95355), TOBN(0x4860d2ea, 0xb7415c84), + TOBN(0x5fdb9bd2, 0x050ebad3), TOBN(0xdb53e0cc, 0x6685a5bf)}}, + {{TOBN(0xb830c031, 0x9feb6593), TOBN(0xdd87f310, 0x6accff17), + TOBN(0x2303ebab, 0x9f555c10), TOBN(0x94603695, 0x287e7065)}, + {TOBN(0xf88311c3, 0x2e83358c), TOBN(0x508dd9b4, 0xeefb0178), + TOBN(0x7ca23706, 0x2dba8652), TOBN(0x62aac5a3, 0x0047abe5)}}, + {{TOBN(0x9a61d2a0, 0x8b1ea7b3), TOBN(0xd495ab63, 0xae8b1485), + TOBN(0x38740f84, 0x87052f99), TOBN(0x178ebe5b, 0xb2974eea)}, + {TOBN(0x030bbcca, 0x5b36d17f), TOBN(0xb5e4cce3, 0xaaf86eea), + TOBN(0xb51a0220, 0x68f8e9e0), TOBN(0xa4348796, 0x09eb3e75)}}, + {{TOBN(0xbe592309, 0xeef1a752), TOBN(0x5d7162d7, 0x6f2aa1ed), + TOBN(0xaebfb5ed, 0x0f007dd2), TOBN(0x255e14b2, 0xc89edd22)}, + {TOBN(0xba85e072, 0x0303b697), TOBN(0xc5d17e25, 0xf05720ff), + TOBN(0x02b58d6e, 0x5128ebb6), TOBN(0x2c80242d, 0xd754e113)}}, + {{TOBN(0x919fca5f, 0xabfae1ca), TOBN(0x937afaac, 0x1a21459b), + TOBN(0x9e0ca91c, 0x1f66a4d2), TOBN(0x194cc7f3, 0x23ec1331)}, + {TOBN(0xad25143a, 0x8aa11690), TOBN(0xbe40ad8d, 0x09b59e08), + TOBN(0x37d60d9b, 0xe750860a), TOBN(0x6c53b008, 0xc6bf434c)}}, + {{TOBN(0xb572415d, 0x1356eb80), TOBN(0xb8bf9da3, 0x9578ded8), + TOBN(0x22658e36, 0x5e8fb38b), TOBN(0x9b70ce22, 0x5af8cb22)}, + {TOBN(0x7c00018a, 0x829a8180), TOBN(0x84329f93, 0xb81ed295), + TOBN(0x7c343ea2, 0x5f3cea83), TOBN(0x38f8655f, 0x67586536)}}, + {{TOBN(0xa661a0d0, 0x1d3ec517), TOBN(0x98744652, 0x512321ae), + TOBN(0x084ca591, 0xeca92598), TOBN(0xa9bb9dc9, 0x1dcb3feb)}, + {TOBN(0x14c54355, 0x78b4c240), TOBN(0x5ed62a3b, 0x610cafdc), + TOBN(0x07512f37, 0x1b38846b), TOBN(0x571bb70a, 0xb0e38161)}}, + {{TOBN(0xb556b95b, 0x2da705d2), TOBN(0x3ef8ada6, 0xb1a08f98), + TOBN(0x85302ca7, 0xddecfbe5), TOBN(0x0e530573, 0x943105cd)}, + {TOBN(0x60554d55, 0x21a9255d), TOBN(0x63a32fa1, 0xf2f3802a), + TOBN(0x35c8c5b0, 0xcd477875), TOBN(0x97f458ea, 0x6ad42da1)}}, + {{TOBN(0x832d7080, 0xeb6b242d), TOBN(0xd30bd023, 0x3b71e246), + TOBN(0x7027991b, 0xbe31139d), TOBN(0x68797e91, 0x462e4e53)}, + {TOBN(0x423fe20a, 0x6b4e185a), TOBN(0x82f2c67e, 0x42d9b707), + TOBN(0x25c81768, 0x4cf7811b), TOBN(0xbd53005e, 0x045bb95d)}}}, + {{{TOBN(0xe5f649be, 0x9d8e68fd), TOBN(0xdb0f0533, 0x1b044320), + TOBN(0xf6fde9b3, 0xe0c33398), TOBN(0x92f4209b, 0x66c8cfae)}, + {TOBN(0xe9d1afcc, 0x1a739d4b), TOBN(0x09aea75f, 0xa28ab8de), + TOBN(0x14375fb5, 0xeac6f1d0), TOBN(0x6420b560, 0x708f7aa5)}}, + {{TOBN(0x9eae499c, 0x6254dc41), TOBN(0x7e293924, 0x7a837e7e), + TOBN(0x74aec08c, 0x090524a7), TOBN(0xf82b9219, 0x8d6f55f2)}, + {TOBN(0x493c962e, 0x1402cec5), TOBN(0x9f17ca17, 0xfa2f30e7), + TOBN(0xbcd783e8, 0xe9b879cb), TOBN(0xea3d8c14, 0x5a6f145f)}}, + {{TOBN(0xdede15e7, 0x5e0dee6e), TOBN(0x74f24872, 0xdc628aa2), + TOBN(0xd3e9c4fe, 0x7861bb93), TOBN(0x56d4822a, 0x6187b2e0)}, + {TOBN(0xb66417cf, 0xc59826f9), TOBN(0xca260969, 0x2408169e), + TOBN(0xedf69d06, 0xc79ef885), TOBN(0x00031f8a, 0xdc7d138f)}}, + {{TOBN(0x103c46e6, 0x0ebcf726), TOBN(0x4482b831, 0x6231470e), + TOBN(0x6f6dfaca, 0x487c2109), TOBN(0x2e0ace97, 0x62e666ef)}, + {TOBN(0x3246a9d3, 0x1f8d1f42), TOBN(0x1b1e83f1, 0x574944d2), + TOBN(0x13dfa63a, 0xa57f334b), TOBN(0x0cf8daed, 0x9f025d81)}}, + {{TOBN(0x30d78ea8, 0x00ee11c1), TOBN(0xeb053cd4, 
0xb5e3dd75), + TOBN(0x9b65b13e, 0xd58c43c5), TOBN(0xc3ad49bd, 0xbd151663)}, + {TOBN(0x99fd8e41, 0xb6427990), TOBN(0x12cf15bd, 0x707eae1e), + TOBN(0x29ad4f1b, 0x1aabb71e), TOBN(0x5143e74d, 0x07545d0e)}}, + {{TOBN(0x30266336, 0xc88bdee1), TOBN(0x25f29306, 0x5876767c), + TOBN(0x9c078571, 0xc6731996), TOBN(0xc88690b2, 0xed552951)}, + {TOBN(0x274f2c2d, 0x852705b4), TOBN(0xb0bf8d44, 0x4e09552d), + TOBN(0x7628beeb, 0x986575d1), TOBN(0x407be238, 0x7f864651)}}, + {{TOBN(0x0e5e3049, 0xa639fc6b), TOBN(0xe75c35d9, 0x86003625), + TOBN(0x0cf35bd8, 0x5dcc1646), TOBN(0x8bcaced2, 0x6c26273a)}, + {TOBN(0xe22ecf1d, 0xb5536742), TOBN(0x013dd897, 0x1a9e068b), + TOBN(0x17f411cb, 0x8a7909c5), TOBN(0x5757ac98, 0x861dd506)}}, + {{TOBN(0x85de1f0d, 0x1e935abb), TOBN(0xdefd10b4, 0x154de37a), + TOBN(0xb8d9e392, 0x369cebb5), TOBN(0x54d5ef9b, 0x761324be)}, + {TOBN(0x4d6341ba, 0x74f17e26), TOBN(0xc0a0e3c8, 0x78c1dde4), + TOBN(0xa6d77581, 0x87d918fd), TOBN(0x66876015, 0x02ca3a13)}}, + {{TOBN(0xc7313e9c, 0xf36658f0), TOBN(0xc433ef1c, 0x71f8057e), + TOBN(0x85326246, 0x1b6a835a), TOBN(0xc8f05398, 0x7c86394c)}, + {TOBN(0xff398cdf, 0xe983c4a1), TOBN(0xbf5e8162, 0x03b7b931), + TOBN(0x93193c46, 0xb7b9045b), TOBN(0x1e4ebf5d, 0xa4a6e46b)}}, + {{TOBN(0xf9942a60, 0x43a24fe7), TOBN(0x29c1191e, 0xffb3492b), + TOBN(0x9f662449, 0x902fde05), TOBN(0xc792a7ac, 0x6713c32d)}, + {TOBN(0x2fd88ad8, 0xb737982c), TOBN(0x7e3a0319, 0xa21e60e3), + TOBN(0x09b0de44, 0x7383591a), TOBN(0x6df141ee, 0x8310a456)}}, + {{TOBN(0xaec1a039, 0xe6d6f471), TOBN(0x14b2ba0f, 0x1198d12e), + TOBN(0xebc1a160, 0x3aeee5ac), TOBN(0x401f4836, 0xe0b964ce)}, + {TOBN(0x2ee43796, 0x4fd03f66), TOBN(0x3fdb4e49, 0xdd8f3f12), + TOBN(0x6ef267f6, 0x29380f18), TOBN(0x3e8e9670, 0x8da64d16)}}, + {{TOBN(0xbc19180c, 0x207674f1), TOBN(0x112e09a7, 0x33ae8fdb), + TOBN(0x99667554, 0x6aaeb71e), TOBN(0x79432af1, 0xe101b1c7)}, + {TOBN(0xd5eb558f, 0xde2ddec6), TOBN(0x81392d1f, 0x5357753f), + TOBN(0xa7a76b97, 0x3ae1158a), TOBN(0x416fbbff, 0x4a899991)}}, + {{TOBN(0x9e65fdfd, 0x0d4a9dcf), TOBN(0x7bc29e48, 0x944ddf12), + TOBN(0xbc1a92d9, 0x3c856866), TOBN(0x273c6905, 0x6e98dfe2)}, + {TOBN(0x69fce418, 0xcdfaa6b8), TOBN(0x606bd823, 0x5061c69f), + TOBN(0x42d495a0, 0x6af75e27), TOBN(0x8ed3d505, 0x6d873a1f)}}, + {{TOBN(0xaf552841, 0x6ab25b6a), TOBN(0xc6c0ffc7, 0x2b1a4523), + TOBN(0xab18827b, 0x21c99e03), TOBN(0x060e8648, 0x9034691b)}, + {TOBN(0x5207f90f, 0x93c7f398), TOBN(0x9f4a96cb, 0x82f8d10b), + TOBN(0xdd71cd79, 0x3ad0f9e3), TOBN(0x84f435d2, 0xfc3a54f5)}}, + {{TOBN(0x4b03c55b, 0x8e33787f), TOBN(0xef42f975, 0xa6384673), + TOBN(0xff7304f7, 0x5051b9f0), TOBN(0x18aca1dc, 0x741c87c2)}, + {TOBN(0x56f120a7, 0x2d4bfe80), TOBN(0xfd823b3d, 0x053e732c), + TOBN(0x11bccfe4, 0x7537ca16), TOBN(0xdf6c9c74, 0x1b5a996b)}}, + {{TOBN(0xee7332c7, 0x904fc3fa), TOBN(0x14a23f45, 0xc7e3636a), + TOBN(0xc38659c3, 0xf091d9aa), TOBN(0x4a995e5d, 0xb12d8540)}, + {TOBN(0x20a53bec, 0xf3a5598a), TOBN(0x56534b17, 0xb1eaa995), + TOBN(0x9ed3dca4, 0xbf04e03c), TOBN(0x716c563a, 0xd8d56268)}}, + {{TOBN(0x27ba77a4, 0x1d6178e7), TOBN(0xe4c80c40, 0x68a1ff8e), + TOBN(0x75011099, 0x0a13f63d), TOBN(0x7bf33521, 0xa61d46f3)}, + {TOBN(0x0aff218e, 0x10b365bb), TOBN(0x81021804, 0x0fd7ea75), + TOBN(0x05a3fd8a, 0xa4b3a925), TOBN(0xb829e75f, 0x9b3db4e6)}}, + {{TOBN(0x6bdc75a5, 0x4d53e5fb), TOBN(0x04a5dc02, 0xd52717e3), + TOBN(0x86af502f, 0xe9a42ec2), TOBN(0x8867e8fb, 0x2630e382)}, + {TOBN(0xbf845c6e, 0xbec9889b), TOBN(0x54f491f2, 0xcb47c98d), + TOBN(0xa3091fba, 0x790c2a12), TOBN(0xd7f6fd78, 0xc20f708b)}}, + {{TOBN(0xa569ac30, 0xacde5e17), 
TOBN(0xd0f996d0, 0x6852b4d7), + TOBN(0xe51d4bb5, 0x4609ae54), TOBN(0x3fa37d17, 0x0daed061)}, + {TOBN(0x62a88684, 0x34b8fb41), TOBN(0x99a2acbd, 0x9efb64f1), + TOBN(0xb75c1a5e, 0x6448e1f2), TOBN(0xfa99951a, 0x42b5a069)}}, + {{TOBN(0x6d956e89, 0x2f3b26e7), TOBN(0xf4709860, 0xda875247), + TOBN(0x3ad15179, 0x2482dda3), TOBN(0xd64110e3, 0x017d82f0)}, + {TOBN(0x14928d2c, 0xfad414e4), TOBN(0x2b155f58, 0x2ed02b24), + TOBN(0x481a141b, 0xcb821bf1), TOBN(0x12e3c770, 0x4f81f5da)}}, + {{TOBN(0xe49c5de5, 0x9fff8381), TOBN(0x11053232, 0x5bbec894), + TOBN(0xa0d051cc, 0x454d88c4), TOBN(0x4f6db89c, 0x1f8e531b)}, + {TOBN(0x34fe3fd6, 0xca563a44), TOBN(0x7f5c2215, 0x58da8ab9), + TOBN(0x8445016d, 0x9474f0a1), TOBN(0x17d34d61, 0xcb7d8a0a)}}, + {{TOBN(0x8e9d3910, 0x1c474019), TOBN(0xcaff2629, 0xd52ceefb), + TOBN(0xf9cf3e32, 0xc1622c2b), TOBN(0xd4b95e3c, 0xe9071a05)}, + {TOBN(0xfbbca61f, 0x1594438c), TOBN(0x1eb6e6a6, 0x04aadedf), + TOBN(0x853027f4, 0x68e14940), TOBN(0x221d322a, 0xdfabda9c)}}, + {{TOBN(0xed8ea9f6, 0xb7cb179a), TOBN(0xdc7b764d, 0xb7934dcc), + TOBN(0xfcb13940, 0x5e09180d), TOBN(0x6629a6bf, 0xb47dc2dd)}, + {TOBN(0xbfc55e4e, 0x9f5a915e), TOBN(0xb1db9d37, 0x6204441e), + TOBN(0xf82d68cf, 0x930c5f53), TOBN(0x17d3a142, 0xcbb605b1)}}, + {{TOBN(0xdd5944ea, 0x308780f2), TOBN(0xdc8de761, 0x3845f5e4), + TOBN(0x6beaba7d, 0x7624d7a3), TOBN(0x1e709afd, 0x304df11e)}, + {TOBN(0x95364376, 0x02170456), TOBN(0xbf204b3a, 0xc8f94b64), + TOBN(0x4e53af7c, 0x5680ca68), TOBN(0x0526074a, 0xe0c67574)}}, + {{TOBN(0x95d8cef8, 0xecd92af6), TOBN(0xe6b9fa7a, 0x6cd1745a), + TOBN(0x3d546d3d, 0xa325c3e4), TOBN(0x1f57691d, 0x9ae93aae)}, + {TOBN(0xe891f3fe, 0x9d2e1a33), TOBN(0xd430093f, 0xac063d35), + TOBN(0xeda59b12, 0x5513a327), TOBN(0xdc2134f3, 0x5536f18f)}}, + {{TOBN(0xaa51fe2c, 0x5c210286), TOBN(0x3f68aaee, 0x1cab658c), + TOBN(0x5a23a00b, 0xf9357292), TOBN(0x9a626f39, 0x7efdabed)}, + {TOBN(0xfe2b3bf3, 0x199d78e3), TOBN(0xb7a2af77, 0x71bbc345), + TOBN(0x3d19827a, 0x1e59802c), TOBN(0x823bbc15, 0xb487a51c)}}, + {{TOBN(0x856139f2, 0x99d0a422), TOBN(0x9ac3df65, 0xf456c6fb), + TOBN(0xaddf65c6, 0x701f8bd6), TOBN(0x149f321e, 0x3758df87)}, + {TOBN(0xb1ecf714, 0x721b7eba), TOBN(0xe17df098, 0x31a3312a), + TOBN(0xdb2fd6ec, 0xd5c4d581), TOBN(0xfd02996f, 0x8fcea1b3)}}, + {{TOBN(0xe29fa63e, 0x7882f14f), TOBN(0xc9f6dc35, 0x07c6cadc), + TOBN(0x46f22d6f, 0xb882bed0), TOBN(0x1a45755b, 0xd118e52c)}, + {TOBN(0x9f2c7c27, 0x7c4608cf), TOBN(0x7ccbdf32, 0x568012c2), + TOBN(0xfcb0aedd, 0x61729b0e), TOBN(0x7ca2ca9e, 0xf7d75dbf)}}, + {{TOBN(0xf58fecb1, 0x6f640f62), TOBN(0xe274b92b, 0x39f51946), + TOBN(0x7f4dfc04, 0x6288af44), TOBN(0x0a91f32a, 0xeac329e5)}, + {TOBN(0x43ad274b, 0xd6aaba31), TOBN(0x719a1640, 0x0f6884f9), + TOBN(0x685d29f6, 0xdaf91e20), TOBN(0x5ec1cc33, 0x27e49d52)}}, + {{TOBN(0x38f4de96, 0x3b54a059), TOBN(0x0e0015e5, 0xefbcfdb3), + TOBN(0x177d23d9, 0x4dbb8da6), TOBN(0x98724aa2, 0x97a617ad)}, + {TOBN(0x30f0885b, 0xfdb6558e), TOBN(0xf9f7a28a, 0xc7899a96), + TOBN(0xd2ae8ac8, 0x872dc112), TOBN(0xfa0642ca, 0x73c3c459)}}, + {{TOBN(0x15296981, 0xe7dfc8d6), TOBN(0x67cd4450, 0x1fb5b94a), + TOBN(0x0ec71cf1, 0x0eddfd37), TOBN(0xc7e5eeb3, 0x9a8eddc7)}, + {TOBN(0x02ac8e3d, 0x81d95028), TOBN(0x0088f172, 0x70b0e35d), + TOBN(0xec041fab, 0xe1881fe3), TOBN(0x62cf71b8, 0xd99e7faa)}}, + {{TOBN(0x5043dea7, 0xe0f222c2), TOBN(0x309d42ac, 0x72e65142), + TOBN(0x94fe9ddd, 0x9216cd30), TOBN(0xd6539c7d, 0x0f87feec)}, + {TOBN(0x03c5a57c, 0x432ac7d7), TOBN(0x72692cf0, 0x327fda10), + TOBN(0xec28c85f, 0x280698de), TOBN(0x2331fb46, 0x7ec283b1)}}, + {{TOBN(0xd34bfa32, 
0x2867e633), TOBN(0x78709a82, 0x0a9cc815), + TOBN(0xb7fe6964, 0x875e2fa5), TOBN(0x25cc064f, 0x9e98bfb5)}, + {TOBN(0x9eb0151c, 0x493a65c5), TOBN(0x5fb5d941, 0x53182464), + TOBN(0x69e6f130, 0xf04618e2), TOBN(0xa8ecec22, 0xf89c8ab6)}}, + {{TOBN(0xcd6ac88b, 0xb96209bd), TOBN(0x65fa8cdb, 0xb3e1c9e0), + TOBN(0xa47d22f5, 0x4a8d8eac), TOBN(0x83895cdf, 0x8d33f963)}, + {TOBN(0xa8adca59, 0xb56cd3d1), TOBN(0x10c8350b, 0xdaf38232), + TOBN(0x2b161fb3, 0xa5080a9f), TOBN(0xbe7f5c64, 0x3af65b3a)}}, + {{TOBN(0x2c754039, 0x97403a11), TOBN(0x94626cf7, 0x121b96af), + TOBN(0x431de7c4, 0x6a983ec2), TOBN(0x3780dd3a, 0x52cc3df7)}, + {TOBN(0xe28a0e46, 0x2baf8e3b), TOBN(0xabe68aad, 0x51d299ae), + TOBN(0x603eb8f9, 0x647a2408), TOBN(0x14c61ed6, 0x5c750981)}}, + {{TOBN(0x88b34414, 0xc53352e7), TOBN(0x5a34889c, 0x1337d46e), + TOBN(0x612c1560, 0xf95f2bc8), TOBN(0x8a3f8441, 0xd4807a3a)}, + {TOBN(0x680d9e97, 0x5224da68), TOBN(0x60cd6e88, 0xc3eb00e9), + TOBN(0x3875a98e, 0x9a6bc375), TOBN(0xdc80f924, 0x4fd554c2)}}, + {{TOBN(0x6c4b3415, 0x6ac77407), TOBN(0xa1e5ea8f, 0x25420681), + TOBN(0x541bfa14, 0x4607a458), TOBN(0x5dbc7e7a, 0x96d7fbf9)}, + {TOBN(0x646a851b, 0x31590a47), TOBN(0x039e85ba, 0x15ee6df8), + TOBN(0xd19fa231, 0xd7b43fc0), TOBN(0x84bc8be8, 0x299a0e04)}}, + {{TOBN(0x2b9d2936, 0xf20df03a), TOBN(0x24054382, 0x8608d472), + TOBN(0x76b6ba04, 0x9149202a), TOBN(0xb21c3831, 0x3670e7b7)}, + {TOBN(0xddd93059, 0xd6fdee10), TOBN(0x9da47ad3, 0x78488e71), + TOBN(0x99cc1dfd, 0xa0fcfb25), TOBN(0x42abde10, 0x64696954)}}, + {{TOBN(0x14cc15fc, 0x17eab9fe), TOBN(0xd6e863e4, 0xd3e70972), + TOBN(0x29a7765c, 0x6432112c), TOBN(0x88660001, 0x5b0774d8)}, + {TOBN(0x3729175a, 0x2c088eae), TOBN(0x13afbcae, 0x8230b8d4), + TOBN(0x44768151, 0x915f4379), TOBN(0xf086431a, 0xd8d22812)}}, + {{TOBN(0x37461955, 0xc298b974), TOBN(0x905fb5f0, 0xf8711e04), + TOBN(0x787abf3a, 0xfe969d18), TOBN(0x392167c2, 0x6f6a494e)}, + {TOBN(0xfc7a0d2d, 0x28c511da), TOBN(0xf127c7dc, 0xb66a262d), + TOBN(0xf9c4bb95, 0xfd63fdf0), TOBN(0x90016589, 0x3913ef46)}}, + {{TOBN(0x74d2a73c, 0x11aa600d), TOBN(0x2f5379bd, 0x9fb5ab52), + TOBN(0xe49e53a4, 0x7fb70068), TOBN(0x68dd39e5, 0x404aa9a7)}, + {TOBN(0xb9b0cf57, 0x2ecaa9c3), TOBN(0xba0e103b, 0xe824826b), + TOBN(0x60c2198b, 0x4631a3c4), TOBN(0xc5ff84ab, 0xfa8966a2)}}, + {{TOBN(0x2d6ebe22, 0xac95aff8), TOBN(0x1c9bb6db, 0xb5a46d09), + TOBN(0x419062da, 0x53ee4f8d), TOBN(0x7b9042d0, 0xbb97efef)}, + {TOBN(0x0f87f080, 0x830cf6bd), TOBN(0x4861d19a, 0x6ec8a6c6), + TOBN(0xd3a0daa1, 0x202f01aa), TOBN(0xb0111674, 0xf25afbd5)}}, + {{TOBN(0x6d00d6cf, 0x1afb20d9), TOBN(0x13695000, 0x40671bc5), + TOBN(0x913ab0dc, 0x2485ea9b), TOBN(0x1f2bed06, 0x9eef61ac)}, + {TOBN(0x850c8217, 0x6d799e20), TOBN(0x93415f37, 0x3271c2de), + TOBN(0x5afb06e9, 0x6c4f5910), TOBN(0x688a52df, 0xc4e9e421)}}, + {{TOBN(0x30495ba3, 0xe2a9a6db), TOBN(0x4601303d, 0x58f9268b), + TOBN(0xbe3b0dad, 0x7eb0f04f), TOBN(0x4ea47250, 0x4456936d)}, + {TOBN(0x8caf8798, 0xd33fd3e7), TOBN(0x1ccd8a89, 0xeb433708), + TOBN(0x9effe3e8, 0x87fd50ad), TOBN(0xbe240a56, 0x6b29c4df)}}, + {{TOBN(0xec4ffd98, 0xca0e7ebd), TOBN(0xf586783a, 0xe748616e), + TOBN(0xa5b00d8f, 0xc77baa99), TOBN(0x0acada29, 0xb4f34c9c)}, + {TOBN(0x36dad67d, 0x0fe723ac), TOBN(0x1d8e53a5, 0x39c36c1e), + TOBN(0xe4dd342d, 0x1f4bea41), TOBN(0x64fd5e35, 0xebc9e4e0)}}, + {{TOBN(0x96f01f90, 0x57908805), TOBN(0xb5b9ea3d, 0x5ed480dd), + TOBN(0x366c5dc2, 0x3efd2dd0), TOBN(0xed2fe305, 0x6e9dfa27)}, + {TOBN(0x4575e892, 0x6e9197e2), TOBN(0x11719c09, 0xab502a5d), + TOBN(0x264c7bec, 0xe81f213f), TOBN(0x741b9241, 0x55f5c457)}}, + 
{{TOBN(0x78ac7b68, 0x49a5f4f4), TOBN(0xf91d70a2, 0x9fc45b7d), + TOBN(0x39b05544, 0xb0f5f355), TOBN(0x11f06bce, 0xeef930d9)}, + {TOBN(0xdb84d25d, 0x038d05e1), TOBN(0x04838ee5, 0xbacc1d51), + TOBN(0x9da3ce86, 0x9e8ee00b), TOBN(0xc3412057, 0xc36eda1f)}}, + {{TOBN(0xae80b913, 0x64d9c2f4), TOBN(0x7468bac3, 0xa010a8ff), + TOBN(0xdfd20037, 0x37359d41), TOBN(0x1a0f5ab8, 0x15efeacc)}, + {TOBN(0x7c25ad2f, 0x659d0ce0), TOBN(0x4011bcbb, 0x6785cff1), + TOBN(0x128b9912, 0x7e2192c7), TOBN(0xa549d8e1, 0x13ccb0e8)}}, + {{TOBN(0x805588d8, 0xc85438b1), TOBN(0x5680332d, 0xbc25cb27), + TOBN(0xdcd1bc96, 0x1a4bfdf4), TOBN(0x779ff428, 0x706f6566)}, + {TOBN(0x8bbee998, 0xf059987a), TOBN(0xf6ce8cf2, 0xcc686de7), + TOBN(0xf8ad3c4a, 0x953cfdb2), TOBN(0xd1d426d9, 0x2205da36)}}, + {{TOBN(0xb3c0f13f, 0xc781a241), TOBN(0x3e89360e, 0xd75362a8), + TOBN(0xccd05863, 0xc8a91184), TOBN(0x9bd0c9b7, 0xefa8a7f4)}, + {TOBN(0x97ee4d53, 0x8a912a4b), TOBN(0xde5e15f8, 0xbcf518fd), + TOBN(0x6a055bf8, 0xc467e1e0), TOBN(0x10be4b4b, 0x1587e256)}}, + {{TOBN(0xd90c14f2, 0x668621c9), TOBN(0xd5518f51, 0xab9c92c1), + TOBN(0x8e6a0100, 0xd6d47b3c), TOBN(0xcbe980dd, 0x66716175)}, + {TOBN(0x500d3f10, 0xddd83683), TOBN(0x3b6cb35d, 0x99cac73c), + TOBN(0x53730c8b, 0x6083d550), TOBN(0xcf159767, 0xdf0a1987)}}, + {{TOBN(0x84bfcf53, 0x43ad73b3), TOBN(0x1b528c20, 0x4f035a94), + TOBN(0x4294edf7, 0x33eeac69), TOBN(0xb6283e83, 0x817f3240)}, + {TOBN(0xc3fdc959, 0x0a5f25b1), TOBN(0xefaf8aa5, 0x5844ee22), + TOBN(0xde269ba5, 0xdbdde4de), TOBN(0xe3347160, 0xc56133bf)}}, + {{TOBN(0xc1184219, 0x8d9ea9f8), TOBN(0x090de5db, 0xf3fc1ab5), + TOBN(0x404c37b1, 0x0bf22cda), TOBN(0x7de20ec8, 0xf5618894)}, + {TOBN(0x754c588e, 0xecdaecab), TOBN(0x6ca4b0ed, 0x88342743), + TOBN(0x76f08bdd, 0xf4a938ec), TOBN(0xd182de89, 0x91493ccb)}}, + {{TOBN(0xd652c53e, 0xc8a4186a), TOBN(0xb3e878db, 0x946d8e33), + TOBN(0x088453c0, 0x5f37663c), TOBN(0x5cd9daaa, 0xb407748b)}, + {TOBN(0xa1f5197f, 0x586d5e72), TOBN(0x47500be8, 0xc443ca59), + TOBN(0x78ef35b2, 0xe2652424), TOBN(0x09c5d26f, 0x6dd7767d)}}, + {{TOBN(0x7175a79a, 0xa74d3f7b), TOBN(0x0428fd8d, 0xcf5ea459), + TOBN(0x511cb97c, 0xa5d1746d), TOBN(0x36363939, 0xe71d1278)}, + {TOBN(0xcf2df955, 0x10350bf4), TOBN(0xb3817439, 0x60aae782), + TOBN(0xa748c0e4, 0x3e688809), TOBN(0x98021fbf, 0xd7a5a006)}}, + {{TOBN(0x9076a70c, 0x0e367a98), TOBN(0xbea1bc15, 0x0f62b7c2), + TOBN(0x2645a68c, 0x30fe0343), TOBN(0xacaffa78, 0x699dc14f)}, + {TOBN(0xf4469964, 0x457bf9c4), TOBN(0x0db6407b, 0x0d2ead83), + TOBN(0x68d56cad, 0xb2c6f3eb), TOBN(0x3b512e73, 0xf376356c)}}, + {{TOBN(0xe43b0e1f, 0xfce10408), TOBN(0x89ddc003, 0x5a5e257d), + TOBN(0xb0ae0d12, 0x0362e5b3), TOBN(0x07f983c7, 0xb0519161)}, + {TOBN(0xc2e94d15, 0x5d5231e7), TOBN(0xcff22aed, 0x0b4f9513), + TOBN(0xb02588dd, 0x6ad0b0b5), TOBN(0xb967d1ac, 0x11d0dcd5)}}, + {{TOBN(0x8dac6bc6, 0xcf777b6c), TOBN(0x0062bdbd, 0x4c6d1959), + TOBN(0x53da71b5, 0x0ef5cc85), TOBN(0x07012c7d, 0x4006f14f)}, + {TOBN(0x4617f962, 0xac47800d), TOBN(0x53365f2b, 0xc102ed75), + TOBN(0xb422efcb, 0x4ab8c9d3), TOBN(0x195cb26b, 0x34af31c9)}}, + {{TOBN(0x3a926e29, 0x05f2c4ce), TOBN(0xbd2bdecb, 0x9856966c), + TOBN(0x5d16ab3a, 0x85527015), TOBN(0x9f81609e, 0x4486c231)}, + {TOBN(0xd8b96b2c, 0xda350002), TOBN(0xbd054690, 0xfa1b7d36), + TOBN(0xdc90ebf5, 0xe71d79bc), TOBN(0xf241b6f9, 0x08964e4e)}}, + {{TOBN(0x7c838643, 0x2fe3cd4c), TOBN(0xe0f33acb, 0xb4bc633c), + TOBN(0xb4a9ecec, 0x3d139f1f), TOBN(0x05ce69cd, 0xdc4a1f49)}, + {TOBN(0xa19d1b16, 0xf5f98aaf), TOBN(0x45bb71d6, 0x6f23e0ef), + TOBN(0x33789fcd, 0x46cdfdd3), TOBN(0x9b8e2978, 0xcee040ca)}}, 
+ {{TOBN(0x9c69b246, 0xae0a6828), TOBN(0xba533d24, 0x7078d5aa), + TOBN(0x7a2e42c0, 0x7bb4fbdb), TOBN(0xcfb4879a, 0x7035385c)}, + {TOBN(0x8c3dd30b, 0x3281705b), TOBN(0x7e361c6c, 0x404fe081), + TOBN(0x7b21649c, 0x3f604edf), TOBN(0x5dbf6a3f, 0xe52ffe47)}}, + {{TOBN(0xc41b7c23, 0x4b54d9bf), TOBN(0x1374e681, 0x3511c3d9), + TOBN(0x1863bf16, 0xc1b2b758), TOBN(0x90e78507, 0x1e9e6a96)}, + {TOBN(0xab4bf98d, 0x5d86f174), TOBN(0xd74e0bd3, 0x85e96fe4), + TOBN(0x8afde39f, 0xcac5d344), TOBN(0x90946dbc, 0xbd91b847)}}, + {{TOBN(0xf5b42358, 0xfe1a838c), TOBN(0x05aae6c5, 0x620ac9d8), + TOBN(0x8e193bd8, 0xa1ce5a0b), TOBN(0x8f710571, 0x4dabfd72)}, + {TOBN(0x8d8fdd48, 0x182caaac), TOBN(0x8c4aeefa, 0x040745cf), + TOBN(0x73c6c30a, 0xf3b93e6d), TOBN(0x991241f3, 0x16f42011)}}, + {{TOBN(0xa0158eea, 0xe457a477), TOBN(0xd19857db, 0xee6ddc05), + TOBN(0xb3265224, 0x18c41671), TOBN(0x3ffdfc7e, 0x3c2c0d58)}, + {TOBN(0x3a3a5254, 0x26ee7cda), TOBN(0x341b0869, 0xdf02c3a8), + TOBN(0xa023bf42, 0x723bbfc8), TOBN(0x3d15002a, 0x14452691)}}}, + {{{TOBN(0x5ef7324c, 0x85edfa30), TOBN(0x25976554, 0x87d4f3da), + TOBN(0x352f5bc0, 0xdcb50c86), TOBN(0x8f6927b0, 0x4832a96c)}, + {TOBN(0xd08ee1ba, 0x55f2f94c), TOBN(0x6a996f99, 0x344b45fa), + TOBN(0xe133cb8d, 0xa8aa455d), TOBN(0x5d0721ec, 0x758dc1f7)}}, + {{TOBN(0x6ba7a920, 0x79e5fb67), TOBN(0xe1331feb, 0x70aa725e), + TOBN(0x5080ccf5, 0x7df5d837), TOBN(0xe4cae01d, 0x7ff72e21)}, + {TOBN(0xd9243ee6, 0x0412a77d), TOBN(0x06ff7cac, 0xdf449025), + TOBN(0xbe75f7cd, 0x23ef5a31), TOBN(0xbc957822, 0x0ddef7a8)}}, + {{TOBN(0x8cf7230c, 0xb0ce1c55), TOBN(0x5b534d05, 0x0bbfb607), + TOBN(0xee1ef113, 0x0e16363b), TOBN(0x27e0aa7a, 0xb4999e82)}, + {TOBN(0xce1dac2d, 0x79362c41), TOBN(0x67920c90, 0x91bb6cb0), + TOBN(0x1e648d63, 0x2223df24), TOBN(0x0f7d9eef, 0xe32e8f28)}}, + {{TOBN(0x6943f39a, 0xfa833834), TOBN(0x22951722, 0xa6328562), + TOBN(0x81d63dd5, 0x4170fc10), TOBN(0x9f5fa58f, 0xaecc2e6d)}, + {TOBN(0xb66c8725, 0xe77d9a3b), TOBN(0x11235cea, 0x6384ebe0), + TOBN(0x06a8c118, 0x5845e24a), TOBN(0x0137b286, 0xebd093b1)}}, + {{TOBN(0xc589e1ce, 0x44ace150), TOBN(0xe0f8d3d9, 0x4381e97c), + TOBN(0x59e99b11, 0x62c5a4b8), TOBN(0x90d262f7, 0xfd0ec9f9)}, + {TOBN(0xfbc854c9, 0x283e13c9), TOBN(0x2d04fde7, 0xaedc7085), + TOBN(0x057d7765, 0x47dcbecb), TOBN(0x8dbdf591, 0x9a76fa5f)}}, + {{TOBN(0xd0150695, 0x0de1e578), TOBN(0x2e1463e7, 0xe9f72bc6), + TOBN(0xffa68441, 0x1b39eca5), TOBN(0x673c8530, 0x7c037f2f)}, + {TOBN(0xd0d6a600, 0x747f91da), TOBN(0xb08d43e1, 0xc9cb78e9), + TOBN(0x0fc0c644, 0x27b5cef5), TOBN(0x5c1d160a, 0xa60a2fd6)}}, + {{TOBN(0xf98cae53, 0x28c8e13b), TOBN(0x375f10c4, 0xb2eddcd1), + TOBN(0xd4eb8b7f, 0x5cce06ad), TOBN(0xb4669f45, 0x80a2e1ef)}, + {TOBN(0xd593f9d0, 0x5bbd8699), TOBN(0x5528a4c9, 0xe7976d13), + TOBN(0x3923e095, 0x1c7e28d3), TOBN(0xb9293790, 0x3f6bb577)}}, + {{TOBN(0xdb567d6a, 0xc42bd6d2), TOBN(0x6df86468, 0xbb1f96ae), + TOBN(0x0efe5b1a, 0x4843b28e), TOBN(0x961bbb05, 0x6379b240)}, + {TOBN(0xb6caf5f0, 0x70a6a26b), TOBN(0x70686c0d, 0x328e6e39), + TOBN(0x80da06cf, 0x895fc8d3), TOBN(0x804d8810, 0xb363fdc9)}}, + {{TOBN(0xbe22877b, 0x207f1670), TOBN(0x9b0dd188, 0x4e615291), + TOBN(0x625ae8dc, 0x97a3c2bf), TOBN(0x08584ef7, 0x439b86e8)}, + {TOBN(0xde7190a5, 0xdcd898ff), TOBN(0x26286c40, 0x2058ee3d), + TOBN(0x3db0b217, 0x5f87b1c1), TOBN(0xcc334771, 0x102a6db5)}}, + {{TOBN(0xd99de954, 0x2f770fb1), TOBN(0x97c1c620, 0x4cd7535e), + TOBN(0xd3b6c448, 0x3f09cefc), TOBN(0xd725af15, 0x5a63b4f8)}, + {TOBN(0x0c95d24f, 0xc01e20ec), TOBN(0xdfd37494, 0x9ae7121f), + TOBN(0x7d6ddb72, 0xec77b7ec), TOBN(0xfe079d3b, 
0x0353a4ae)}}, + {{TOBN(0x3066e70a, 0x2e6ac8d2), TOBN(0x9c6b5a43, 0x106e5c05), + TOBN(0x52d3c6f5, 0xede59b8c), TOBN(0x30d6a5c3, 0xfccec9ae)}, + {TOBN(0xedec7c22, 0x4fc0a9ef), TOBN(0x190ff083, 0x95c16ced), + TOBN(0xbe12ec8f, 0x94de0fde), TOBN(0x0d131ab8, 0x852d3433)}}, + {{TOBN(0x42ace07e, 0x85701291), TOBN(0x94793ed9, 0x194061a8), + TOBN(0x30e83ed6, 0xd7f4a485), TOBN(0x9eec7269, 0xf9eeff4d)}, + {TOBN(0x90acba59, 0x0c9d8005), TOBN(0x5feca458, 0x1e79b9d1), + TOBN(0x8fbe5427, 0x1d506a1e), TOBN(0xa32b2c8e, 0x2439cfa7)}}, + {{TOBN(0x1671c173, 0x73dd0b4e), TOBN(0x37a28214, 0x44a054c6), + TOBN(0x81760a1b, 0x4e8b53f1), TOBN(0xa6c04224, 0xf9f93b9e)}, + {TOBN(0x18784b34, 0xcf671e3c), TOBN(0x81bbecd2, 0xcda9b994), + TOBN(0x38831979, 0xb2ab3848), TOBN(0xef54feb7, 0xf2e03c2d)}}, + {{TOBN(0xcf197ca7, 0xfb8088fa), TOBN(0x01427247, 0x4ddc96c5), + TOBN(0xa2d2550a, 0x30777176), TOBN(0x53469898, 0x4d0cf71d)}, + {TOBN(0x6ce937b8, 0x3a2aaac6), TOBN(0xe9f91dc3, 0x5af38d9b), + TOBN(0x2598ad83, 0xc8bf2899), TOBN(0x8e706ac9, 0xb5536c16)}}, + {{TOBN(0x40dc7495, 0xf688dc98), TOBN(0x26490cd7, 0x124c4afc), + TOBN(0xe651ec84, 0x1f18775c), TOBN(0x393ea6c3, 0xb4fdaf4a)}, + {TOBN(0x1e1f3343, 0x7f338e0d), TOBN(0x39fb832b, 0x6053e7b5), + TOBN(0x46e702da, 0x619e14d5), TOBN(0x859cacd1, 0xcdeef6e0)}}, + {{TOBN(0x63b99ce7, 0x4462007d), TOBN(0xb8ab48a5, 0x4cb5f5b7), + TOBN(0x9ec673d2, 0xf55edde7), TOBN(0xd1567f74, 0x8cfaefda)}, + {TOBN(0x46381b6b, 0x0887bcec), TOBN(0x694497ce, 0xe178f3c2), + TOBN(0x5e6525e3, 0x1e6266cb), TOBN(0x5931de26, 0x697d6413)}}, + {{TOBN(0x87f8df7c, 0x0e58d493), TOBN(0xb1ae5ed0, 0x58b73f12), + TOBN(0xc368f784, 0xdea0c34d), TOBN(0x9bd0a120, 0x859a91a0)}, + {TOBN(0xb00d88b7, 0xcc863c68), TOBN(0x3a1cc11e, 0x3d1f4d65), + TOBN(0xea38e0e7, 0x0aa85593), TOBN(0x37f13e98, 0x7dc4aee8)}}, + {{TOBN(0x10d38667, 0xbc947bad), TOBN(0x738e07ce, 0x2a36ee2e), + TOBN(0xc93470cd, 0xc577fcac), TOBN(0xdee1b616, 0x2782470d)}, + {TOBN(0x36a25e67, 0x2e793d12), TOBN(0xd6aa6cae, 0xe0f186da), + TOBN(0x474d0fd9, 0x80e07af7), TOBN(0xf7cdc47d, 0xba8a5cd4)}}, + {{TOBN(0x28af6d9d, 0xab15247f), TOBN(0x7c789c10, 0x493a537f), + TOBN(0x7ac9b110, 0x23a334e7), TOBN(0x0236ac09, 0x12c9c277)}, + {TOBN(0xa7e5bd25, 0x1d7a5144), TOBN(0x098b9c2a, 0xf13ec4ec), + TOBN(0x3639daca, 0xd3f0abca), TOBN(0x642da81a, 0xa23960f9)}}, + {{TOBN(0x7d2e5c05, 0x4f7269b1), TOBN(0xfcf30777, 0xe287c385), + TOBN(0x10edc84f, 0xf2a46f21), TOBN(0x35441757, 0x4f43fa36)}, + {TOBN(0xf1327899, 0xfd703431), TOBN(0xa438d7a6, 0x16dd587a), + TOBN(0x65c34c57, 0xe9c8352d), TOBN(0xa728edab, 0x5cc5a24e)}}, + {{TOBN(0xaed78abc, 0x42531689), TOBN(0x0a51a0e8, 0x010963ef), + TOBN(0x5776fa0a, 0xd717d9b3), TOBN(0xf356c239, 0x7dd3428b)}, + {TOBN(0x29903fff, 0x8d3a3dac), TOBN(0x409597fa, 0x3d94491f), + TOBN(0x4cd7a5ff, 0xbf4a56a4), TOBN(0xe5096474, 0x8adab462)}}, + {{TOBN(0xa97b5126, 0x5c3427b0), TOBN(0x6401405c, 0xd282c9bd), + TOBN(0x3629f8d7, 0x222c5c45), TOBN(0xb1c02c16, 0xe8d50aed)}, + {TOBN(0xbea2ed75, 0xd9635bc9), TOBN(0x226790c7, 0x6e24552f), + TOBN(0x3c33f2a3, 0x65f1d066), TOBN(0x2a43463e, 0x6dfccc2e)}}, + {{TOBN(0x8cc3453a, 0xdb483761), TOBN(0xe7cc6085, 0x65d5672b), + TOBN(0x277ed6cb, 0xde3efc87), TOBN(0x19f2f368, 0x69234eaf)}, + {TOBN(0x9aaf4317, 0x5c0b800b), TOBN(0x1f1e7c89, 0x8b6da6e2), + TOBN(0x6cfb4715, 0xb94ec75e), TOBN(0xd590dd5f, 0x453118c2)}}, + {{TOBN(0x14e49da1, 0x1f17a34c), TOBN(0x5420ab39, 0x235a1456), + TOBN(0xb7637241, 0x2f50363b), TOBN(0x7b15d623, 0xc3fabb6e)}, + {TOBN(0xa0ef40b1, 0xe274e49c), TOBN(0x5cf50744, 0x96b1860a), + TOBN(0xd6583fbf, 0x66afe5a4), 
TOBN(0x44240510, 0xf47e3e9a)}}, + {{TOBN(0x99254343, 0x11b2d595), TOBN(0xf1367499, 0xeec8df57), + TOBN(0x3cb12c61, 0x3e73dd05), TOBN(0xd248c033, 0x7dac102a)}, + {TOBN(0xcf154f13, 0xa77739f5), TOBN(0xbf4288cb, 0x23d2af42), + TOBN(0xaa64c9b6, 0x32e4a1cf), TOBN(0xee8c07a8, 0xc8a208f3)}}, + {{TOBN(0xe10d4999, 0x6fe8393f), TOBN(0x0f809a3f, 0xe91f3a32), + TOBN(0x61096d1c, 0x802f63c8), TOBN(0x289e1462, 0x57750d3d)}, + {TOBN(0xed06167e, 0x9889feea), TOBN(0xd5c9c0e2, 0xe0993909), + TOBN(0x46fca0d8, 0x56508ac6), TOBN(0x91826047, 0x4f1b8e83)}}, + {{TOBN(0x4f2c877a, 0x9a4a2751), TOBN(0x71bd0072, 0xcae6fead), + TOBN(0x38df8dcc, 0x06aa1941), TOBN(0x5a074b4c, 0x63beeaa8)}, + {TOBN(0xd6d65934, 0xc1cec8ed), TOBN(0xa6ecb49e, 0xaabc03bd), + TOBN(0xaade91c2, 0xde8a8415), TOBN(0xcfb0efdf, 0x691136e0)}}, + {{TOBN(0x11af45ee, 0x23ab3495), TOBN(0xa132df88, 0x0b77463d), + TOBN(0x8923c15c, 0x815d06f4), TOBN(0xc3ceb3f5, 0x0d61a436)}, + {TOBN(0xaf52291d, 0xe88fb1da), TOBN(0xea057974, 0x1da12179), + TOBN(0xb0d7218c, 0xd2fef720), TOBN(0x6c0899c9, 0x8e1d8845)}}, + {{TOBN(0x98157504, 0x752ddad7), TOBN(0xd60bd74f, 0xa1a68a97), + TOBN(0x7047a3a9, 0xf658fb99), TOBN(0x1f5d86d6, 0x5f8511e4)}, + {TOBN(0xb8a4bc42, 0x4b5a6d88), TOBN(0x69eb2c33, 0x1abefa7d), + TOBN(0x95bf39e8, 0x13c9c510), TOBN(0xf571960a, 0xd48aab43)}}, + {{TOBN(0x7e8cfbcf, 0x704e23c6), TOBN(0xc71b7d22, 0x28aaa65b), + TOBN(0xa041b2bd, 0x245e3c83), TOBN(0x69b98834, 0xd21854ff)}, + {TOBN(0x89d227a3, 0x963bfeec), TOBN(0x99947aaa, 0xde7da7cb), + TOBN(0x1d9ee9db, 0xee68a9b1), TOBN(0x0a08f003, 0x698ec368)}}, + {{TOBN(0xe9ea4094, 0x78ef2487), TOBN(0xc8d2d415, 0x02cfec26), + TOBN(0xc52f9a6e, 0xb7dcf328), TOBN(0x0ed489e3, 0x85b6a937)}, + {TOBN(0x9b94986b, 0xbef3366e), TOBN(0x0de59c70, 0xedddddb8), + TOBN(0xffdb748c, 0xeadddbe2), TOBN(0x9b9784bb, 0x8266ea40)}}, + {{TOBN(0x142b5502, 0x1a93507a), TOBN(0xb4cd1187, 0x8d3c06cf), + TOBN(0xdf70e76a, 0x91ec3f40), TOBN(0x484e81ad, 0x4e7553c2)}, + {TOBN(0x830f87b5, 0x272e9d6e), TOBN(0xea1c93e5, 0xc6ff514a), + TOBN(0x67cc2adc, 0xc4192a8e), TOBN(0xc77e27e2, 0x42f4535a)}}, + {{TOBN(0x9cdbab36, 0xd2b713c5), TOBN(0x86274ea0, 0xcf7b0cd3), + TOBN(0x784680f3, 0x09af826b), TOBN(0xbfcc837a, 0x0c72dea3)}, + {TOBN(0xa8bdfe9d, 0xd6529b73), TOBN(0x708aa228, 0x63a88002), + TOBN(0x6c7a9a54, 0xc91d45b9), TOBN(0xdf1a38bb, 0xfd004f56)}}, + {{TOBN(0x2e8c9a26, 0xb8bad853), TOBN(0x2d52cea3, 0x3723eae7), + TOBN(0x054d6d81, 0x56ca2830), TOBN(0xa3317d14, 0x9a8dc411)}, + {TOBN(0xa08662fe, 0xfd4ddeda), TOBN(0xed2a153a, 0xb55d792b), + TOBN(0x7035c16a, 0xbfc6e944), TOBN(0xb6bc5834, 0x00171cf3)}}, + {{TOBN(0xe27152b3, 0x83d102b6), TOBN(0xfe695a47, 0x0646b848), + TOBN(0xa5bb09d8, 0x916e6d37), TOBN(0xb4269d64, 0x0d17015e)}, + {TOBN(0x8d8156a1, 0x0a1d2285), TOBN(0xfeef6c51, 0x46d26d72), + TOBN(0x9dac57c8, 0x4c5434a7), TOBN(0x0282e5be, 0x59d39e31)}}, + {{TOBN(0xedfff181, 0x721c486d), TOBN(0x301baf10, 0xbc58824e), + TOBN(0x8136a6aa, 0x00570031), TOBN(0x55aaf78c, 0x1cddde68)}, + {TOBN(0x26829371, 0x59c63952), TOBN(0x3a3bd274, 0x8bc25baf), + TOBN(0xecdf8657, 0xb7e52dc3), TOBN(0x2dd8c087, 0xfd78e6c8)}}, + {{TOBN(0x20553274, 0xf5531461), TOBN(0x8b4a1281, 0x5d95499b), + TOBN(0xe2c8763a, 0x1a80f9d2), TOBN(0xd1dbe32b, 0x4ddec758)}, + {TOBN(0xaf12210d, 0x30c34169), TOBN(0xba74a953, 0x78baa533), + TOBN(0x3d133c6e, 0xa438f254), TOBN(0xa431531a, 0x201bef5b)}}, + {{TOBN(0x15295e22, 0xf669d7ec), TOBN(0xca374f64, 0x357fb515), + TOBN(0x8a8406ff, 0xeaa3fdb3), TOBN(0x106ae448, 0xdf3f2da8)}, + {TOBN(0x8f9b0a90, 0x33c8e9a1), TOBN(0x234645e2, 0x71ad5885), + TOBN(0x3d083224, 
0x1c0aed14), TOBN(0xf10a7d3e, 0x7a942d46)}}, + {{TOBN(0x7c11deee, 0x40d5c9be), TOBN(0xb2bae7ff, 0xba84ed98), + TOBN(0x93e97139, 0xaad58ddd), TOBN(0x3d872796, 0x3f6d1fa3)}, + {TOBN(0x483aca81, 0x8569ff13), TOBN(0x8b89a5fb, 0x9a600f72), + TOBN(0x4cbc27c3, 0xc06f2b86), TOBN(0x22130713, 0x63ad9c0b)}}, + {{TOBN(0xb5358b1e, 0x48ac2840), TOBN(0x18311294, 0xecba9477), + TOBN(0xda58f990, 0xa6946b43), TOBN(0x3098baf9, 0x9ab41819)}, + {TOBN(0x66c4c158, 0x4198da52), TOBN(0xab4fc17c, 0x146bfd1b), + TOBN(0x2f0a4c3c, 0xbf36a908), TOBN(0x2ae9e34b, 0x58cf7838)}}, + {{TOBN(0xf411529e, 0x3fa11b1f), TOBN(0x21e43677, 0x974af2b4), + TOBN(0x7c20958e, 0xc230793b), TOBN(0x710ea885, 0x16e840f3)}, + {TOBN(0xfc0b21fc, 0xc5dc67cf), TOBN(0x08d51647, 0x88405718), + TOBN(0xd955c21f, 0xcfe49eb7), TOBN(0x9722a5d5, 0x56dd4a1f)}}, + {{TOBN(0xc9ef50e2, 0xc861baa5), TOBN(0xc0c21a5d, 0x9505ac3e), + TOBN(0xaf6b9a33, 0x8b7c063f), TOBN(0xc6370339, 0x2f4779c1)}, + {TOBN(0x22df99c7, 0x638167c3), TOBN(0xfe6ffe76, 0x795db30c), + TOBN(0x2b822d33, 0xa4854989), TOBN(0xfef031dd, 0x30563aa5)}}, + {{TOBN(0x16b09f82, 0xd57c667f), TOBN(0xc70312ce, 0xcc0b76f1), + TOBN(0xbf04a9e6, 0xc9118aec), TOBN(0x82fcb419, 0x3409d133)}, + {TOBN(0x1a8ab385, 0xab45d44d), TOBN(0xfba07222, 0x617b83a3), + TOBN(0xb05f50dd, 0x58e81b52), TOBN(0x1d8db553, 0x21ce5aff)}}, + {{TOBN(0x3097b8d4, 0xe344a873), TOBN(0x7d8d116d, 0xfe36d53e), + TOBN(0x6db22f58, 0x7875e750), TOBN(0x2dc5e373, 0x43e144ea)}, + {TOBN(0xc05f32e6, 0xe799eb95), TOBN(0xe9e5f4df, 0x6899e6ec), + TOBN(0xbdc3bd68, 0x1fab23d5), TOBN(0xb72b8ab7, 0x73af60e6)}}, + {{TOBN(0x8db27ae0, 0x2cecc84a), TOBN(0x600016d8, 0x7bdb871c), + TOBN(0x42a44b13, 0xd7c46f58), TOBN(0xb8919727, 0xc3a77d39)}, + {TOBN(0xcfc6bbbd, 0xdafd6088), TOBN(0x1a740146, 0x6bd20d39), + TOBN(0x8c747abd, 0x98c41072), TOBN(0x4c91e765, 0xbdf68ea1)}}, + {{TOBN(0x7c95e5ca, 0x08819a78), TOBN(0xcf48b729, 0xc9587921), + TOBN(0x091c7c5f, 0xdebbcc7d), TOBN(0x6f287404, 0xf0e05149)}, + {TOBN(0xf83b5ac2, 0x26cd44ec), TOBN(0x88ae32a6, 0xcfea250e), + TOBN(0x6ac5047a, 0x1d06ebc5), TOBN(0xc7e550b4, 0xd434f781)}}, + {{TOBN(0x61ab1cf2, 0x5c727bd2), TOBN(0x2e4badb1, 0x1cf915b0), + TOBN(0x1b4dadec, 0xf69d3920), TOBN(0xe61b1ca6, 0xf14c1dfe)}, + {TOBN(0x90b479cc, 0xbd6bd51f), TOBN(0x8024e401, 0x8045ec30), + TOBN(0xcab29ca3, 0x25ef0e62), TOBN(0x4f2e9416, 0x49e4ebc0)}}, + {{TOBN(0x45eb40ec, 0x0ccced58), TOBN(0x25cd4b9c, 0x0da44f98), + TOBN(0x43e06458, 0x871812c6), TOBN(0x99f80d55, 0x16cef651)}, + {TOBN(0x571340c9, 0xce6dc153), TOBN(0x138d5117, 0xd8665521), + TOBN(0xacdb45bc, 0x4e07014d), TOBN(0x2f34bb38, 0x84b60b91)}}, + {{TOBN(0xf44a4fd2, 0x2ae8921e), TOBN(0xb039288e, 0x892ba1e2), + TOBN(0x9da50174, 0xb1c180b2), TOBN(0x6b70ab66, 0x1693dc87)}, + {TOBN(0x7e9babc9, 0xe7057481), TOBN(0x4581ddef, 0x9c80dc41), + TOBN(0x0c890da9, 0x51294682), TOBN(0x0b5629d3, 0x3f4736e5)}}, + {{TOBN(0x2340c79e, 0xb06f5b41), TOBN(0xa42e84ce, 0x4e243469), + TOBN(0xf9a20135, 0x045a71a9), TOBN(0xefbfb415, 0xd27b6fb6)}, + {TOBN(0x25ebea23, 0x9d33cd6f), TOBN(0x9caedb88, 0xaa6c0af8), + TOBN(0x53dc7e9a, 0xd9ce6f96), TOBN(0x3897f9fd, 0x51e0b15a)}}, + {{TOBN(0xf51cb1f8, 0x8e5d788e), TOBN(0x1aec7ba8, 0xe1d490ee), + TOBN(0x265991e0, 0xcc58cb3c), TOBN(0x9f306e8c, 0x9fc3ad31)}, + {TOBN(0x5fed006e, 0x5040a0ac), TOBN(0xca9d5043, 0xfb476f2e), + TOBN(0xa19c06e8, 0xbeea7a23), TOBN(0xd2865801, 0x0edabb63)}}, + {{TOBN(0xdb92293f, 0x6967469a), TOBN(0x2894d839, 0x8d8a8ed8), + TOBN(0x87c9e406, 0xbbc77122), TOBN(0x8671c6f1, 0x2ea3a26a)}, + {TOBN(0xe42df8d6, 0xd7de9853), TOBN(0x2e3ce346, 0xb1f2bcc7), + 
TOBN(0xda601dfc, 0x899d50cf), TOBN(0xbfc913de, 0xfb1b598f)}}, + {{TOBN(0x81c4909f, 0xe61f7908), TOBN(0x192e304f, 0x9bbc7b29), + TOBN(0xc3ed8738, 0xc104b338), TOBN(0xedbe9e47, 0x783f5d61)}, + {TOBN(0x0c06e9be, 0x2db30660), TOBN(0xda3e613f, 0xc0eb7d8e), + TOBN(0xd8fa3e97, 0x322e096e), TOBN(0xfebd91e8, 0xd336e247)}}, + {{TOBN(0x8f13ccc4, 0xdf655a49), TOBN(0xa9e00dfc, 0x5eb20210), + TOBN(0x84631d0f, 0xc656b6ea), TOBN(0x93a058cd, 0xd8c0d947)}, + {TOBN(0x6846904a, 0x67bd3448), TOBN(0x4a3d4e1a, 0xf394fd5c), + TOBN(0xc102c1a5, 0xdb225f52), TOBN(0xe3455bba, 0xfc4f5e9a)}}, + {{TOBN(0x6b36985b, 0x4b9ad1ce), TOBN(0xa9818536, 0x5bb7f793), + TOBN(0x6c25e1d0, 0x48b1a416), TOBN(0x1381dd53, 0x3c81bee7)}, + {TOBN(0xd2a30d61, 0x7a4a7620), TOBN(0xc8412926, 0x39b8944c), + TOBN(0x3c1c6fbe, 0x7a97c33a), TOBN(0x941e541d, 0x938664e7)}}, + {{TOBN(0x417499e8, 0x4a34f239), TOBN(0x15fdb83c, 0xb90402d5), + TOBN(0xb75f46bf, 0x433aa832), TOBN(0xb61e15af, 0x63215db1)}, + {TOBN(0xaabe59d4, 0xa127f89a), TOBN(0x5d541e0c, 0x07e816da), + TOBN(0xaaba0659, 0xa618b692), TOBN(0x55327733, 0x17266026)}}, + {{TOBN(0xaf53a0fc, 0x95f57552), TOBN(0x32947650, 0x6cacb0c9), + TOBN(0x253ff58d, 0xc821be01), TOBN(0xb0309531, 0xa06f1146)}, + {TOBN(0x59bbbdf5, 0x05c2e54d), TOBN(0x158f27ad, 0x26e8dd22), + TOBN(0xcc5b7ffb, 0x397e1e53), TOBN(0xae03f65b, 0x7fc1e50d)}}, + {{TOBN(0xa9784ebd, 0x9c95f0f9), TOBN(0x5ed9deb2, 0x24640771), + TOBN(0x31244af7, 0x035561c4), TOBN(0x87332f3a, 0x7ee857de)}, + {TOBN(0x09e16e9e, 0x2b9e0d88), TOBN(0x52d910f4, 0x56a06049), + TOBN(0x507ed477, 0xa9592f48), TOBN(0x85cb917b, 0x2365d678)}}, + {{TOBN(0xf8511c93, 0x4c8998d1), TOBN(0x2186a3f1, 0x730ea58f), + TOBN(0x50189626, 0xb2029db0), TOBN(0x9137a6d9, 0x02ceb75a)}, + {TOBN(0x2fe17f37, 0x748bc82c), TOBN(0x87c2e931, 0x80469f8c), + TOBN(0x850f71cd, 0xbf891aa2), TOBN(0x0ca1b89b, 0x75ec3d8d)}}, + {{TOBN(0x516c43aa, 0x5e1cd3cd), TOBN(0x89397808, 0x9a887c28), + TOBN(0x0059c699, 0xddea1f9f), TOBN(0x7737d6fa, 0x8e6868f7)}, + {TOBN(0x6d93746a, 0x60f1524b), TOBN(0x36985e55, 0xba052aa7), + TOBN(0x41b1d322, 0xed923ea5), TOBN(0x3429759f, 0x25852a11)}}, + {{TOBN(0xbeca6ec3, 0x092e9f41), TOBN(0x3a238c66, 0x62256bbd), + TOBN(0xd82958ea, 0x70ad487d), TOBN(0x4ac8aaf9, 0x65610d93)}, + {TOBN(0x3fa101b1, 0x5e4ccab0), TOBN(0x9bf430f2, 0x9de14bfb), + TOBN(0xa10f5cc6, 0x6531899d), TOBN(0x590005fb, 0xea8ce17d)}}, + {{TOBN(0xc437912f, 0x24544cb6), TOBN(0x9987b71a, 0xd79ac2e3), + TOBN(0x13e3d9dd, 0xc058a212), TOBN(0x00075aac, 0xd2de9606)}, + {TOBN(0x80ab508b, 0x6cac8369), TOBN(0x87842be7, 0xf54f6c89), + TOBN(0xa7ad663d, 0x6bc532a4), TOBN(0x67813de7, 0x78a91bc8)}}, + {{TOBN(0x5dcb61ce, 0xc3427239), TOBN(0x5f3c7cf0, 0xc56934d9), + TOBN(0xc079e0fb, 0xe3191591), TOBN(0xe40896bd, 0xb01aada7)}, + {TOBN(0x8d466791, 0x0492d25f), TOBN(0x8aeb30c9, 0xe7408276), + TOBN(0xe9437495, 0x9287aacc), TOBN(0x23d4708d, 0x79fe03d4)}}, + {{TOBN(0x8cda9cf2, 0xd0c05199), TOBN(0x502fbc22, 0xfae78454), + TOBN(0xc0bda9df, 0xf572a182), TOBN(0x5f9b71b8, 0x6158b372)}, + {TOBN(0xe0f33a59, 0x2b82dd07), TOBN(0x76302735, 0x9523032e), + TOBN(0x7fe1a721, 0xc4505a32), TOBN(0x7b6e3e82, 0xf796409f)}}}, + {{{TOBN(0xe3417bc0, 0x35d0b34a), TOBN(0x440b386b, 0x8327c0a7), + TOBN(0x8fb7262d, 0xac0362d1), TOBN(0x2c41114c, 0xe0cdf943)}, + {TOBN(0x2ba5cef1, 0xad95a0b1), TOBN(0xc09b37a8, 0x67d54362), + TOBN(0x26d6cdd2, 0x01e486c9), TOBN(0x20477abf, 0x42ff9297)}}, + {{TOBN(0xa004dcb3, 0x292a9287), TOBN(0xddc15cf6, 0x77b092c7), + TOBN(0x083a8464, 0x806c0605), TOBN(0x4a68df70, 0x3db997b0)}, + {TOBN(0x9c134e45, 0x05bf7dd0), TOBN(0xa4e63d39, 
0x8ccf7f8c), + TOBN(0xa6e6517f, 0x41b5f8af), TOBN(0xaa8b9342, 0xad7bc1cc)}}, + {{TOBN(0x126f35b5, 0x1e706ad9), TOBN(0xb99cebb4, 0xc3a9ebdf), + TOBN(0xa75389af, 0xbf608d90), TOBN(0x76113c4f, 0xc6c89858)}, + {TOBN(0x80de8eb0, 0x97e2b5aa), TOBN(0x7e1022cc, 0x63b91304), + TOBN(0x3bdab605, 0x6ccc066c), TOBN(0x33cbb144, 0xb2edf900)}}, + {{TOBN(0xc4176471, 0x7af715d2), TOBN(0xe2f7f594, 0xd0134a96), + TOBN(0x2c1873ef, 0xa41ec956), TOBN(0xe4e7b4f6, 0x77821304)}, + {TOBN(0xe5c8ff97, 0x88d5374a), TOBN(0x2b915e63, 0x80823d5b), + TOBN(0xea6bc755, 0xb2ee8fe2), TOBN(0x6657624c, 0xe7112651)}}, + {{TOBN(0x157af101, 0xdace5aca), TOBN(0xc4fdbcf2, 0x11a6a267), + TOBN(0xdaddf340, 0xc49c8609), TOBN(0x97e49f52, 0xe9604a65)}, + {TOBN(0x9be8e790, 0x937e2ad5), TOBN(0x846e2508, 0x326e17f1), + TOBN(0x3f38007a, 0x0bbbc0dc), TOBN(0xcf03603f, 0xb11e16d6)}}, + {{TOBN(0xd6f800e0, 0x7442f1d5), TOBN(0x475607d1, 0x66e0e3ab), + TOBN(0x82807f16, 0xb7c64047), TOBN(0x8858e1e3, 0xa749883d)}, + {TOBN(0x5859120b, 0x8231ee10), TOBN(0x1b80e7eb, 0x638a1ece), + TOBN(0xcb72525a, 0xc6aa73a4), TOBN(0xa7cdea3d, 0x844423ac)}}, + {{TOBN(0x5ed0c007, 0xf8ae7c38), TOBN(0x6db07a5c, 0x3d740192), + TOBN(0xbe5e9c2a, 0x5fe36db3), TOBN(0xd5b9d57a, 0x76e95046)}, + {TOBN(0x54ac32e7, 0x8eba20f2), TOBN(0xef11ca8f, 0x71b9a352), + TOBN(0x305e373e, 0xff98a658), TOBN(0xffe5a100, 0x823eb667)}}, + {{TOBN(0x57477b11, 0xe51732d2), TOBN(0xdfd6eb28, 0x2538fc0e), + TOBN(0x5c43b0cc, 0x3b39eec5), TOBN(0x6af12778, 0xcb36cc57)}, + {TOBN(0x70b0852d, 0x06c425ae), TOBN(0x6df92f8c, 0x5c221b9b), + TOBN(0x6c8d4f9e, 0xce826d9c), TOBN(0xf59aba7b, 0xb49359c3)}}, + {{TOBN(0x5c8ed8d5, 0xda64309d), TOBN(0x61a6de56, 0x91b30704), + TOBN(0xd6b52f6a, 0x2f9b5808), TOBN(0x0eee4194, 0x98c958a7)}, + {TOBN(0xcddd9aab, 0x771e4caa), TOBN(0x83965dfd, 0x78bc21be), + TOBN(0x02affce3, 0xb3b504f5), TOBN(0x30847a21, 0x561c8291)}}, + {{TOBN(0xd2eb2cf1, 0x52bfda05), TOBN(0xe0e4c4e9, 0x6197b98c), + TOBN(0x1d35076c, 0xf8a1726f), TOBN(0x6c06085b, 0x2db11e3d)}, + {TOBN(0x15c0c4d7, 0x4463ba14), TOBN(0x9d292f83, 0x0030238c), + TOBN(0x1311ee8b, 0x3727536d), TOBN(0xfeea86ef, 0xbeaedc1e)}}, + {{TOBN(0xb9d18cd3, 0x66131e2e), TOBN(0xf31d974f, 0x80fe2682), + TOBN(0xb6e49e0f, 0xe4160289), TOBN(0x7c48ec0b, 0x08e92799)}, + {TOBN(0x818111d8, 0xd1989aa7), TOBN(0xb34fa0aa, 0xebf926f9), + TOBN(0xdb5fe2f5, 0xa245474a), TOBN(0xf80a6ebb, 0x3c7ca756)}}, + {{TOBN(0xa7f96054, 0xafa05dd8), TOBN(0x26dfcf21, 0xfcaf119e), + TOBN(0xe20ef2e3, 0x0564bb59), TOBN(0xef4dca50, 0x61cb02b8)}, + {TOBN(0xcda7838a, 0x65d30672), TOBN(0x8b08d534, 0xfd657e86), + TOBN(0x4c5b4395, 0x46d595c8), TOBN(0x39b58725, 0x425cb836)}}, + {{TOBN(0x8ea61059, 0x3de9abe3), TOBN(0x40434881, 0x9cdc03be), + TOBN(0x9b261245, 0xcfedce8c), TOBN(0x78c318b4, 0xcf5234a1)}, + {TOBN(0x510bcf16, 0xfde24c99), TOBN(0x2a77cb75, 0xa2c2ff5d), + TOBN(0x9c895c2b, 0x27960fb4), TOBN(0xd30ce975, 0xb0eda42b)}}, + {{TOBN(0xfda85393, 0x1a62cc26), TOBN(0x23c69b96, 0x50c0e052), + TOBN(0xa227df15, 0xbfc633f3), TOBN(0x2ac78848, 0x1bae7d48)}, + {TOBN(0x487878f9, 0x187d073d), TOBN(0x6c2be919, 0x967f807d), + TOBN(0x765861d8, 0x336e6d8f), TOBN(0x88b8974c, 0xce528a43)}}, + {{TOBN(0x09521177, 0xff57d051), TOBN(0x2ff38037, 0xfb6a1961), + TOBN(0xfc0aba74, 0xa3d76ad4), TOBN(0x7c764803, 0x25a7ec17)}, + {TOBN(0x7532d75f, 0x48879bc8), TOBN(0xea7eacc0, 0x58ce6bc1), + TOBN(0xc82176b4, 0x8e896c16), TOBN(0x9a30e0b2, 0x2c750fed)}}, + {{TOBN(0xc37e2c2e, 0x421d3aa4), TOBN(0xf926407c, 0xe84fa840), + TOBN(0x18abc03d, 0x1454e41c), TOBN(0x26605ecd, 0x3f7af644)}, + {TOBN(0x242341a6, 0xd6a5eabf), 
TOBN(0x1edb84f4, 0x216b668e), + TOBN(0xd836edb8, 0x04010102), TOBN(0x5b337ce7, 0x945e1d8c)}}, + {{TOBN(0xd2075c77, 0xc055dc14), TOBN(0x2a0ffa25, 0x81d89cdf), + TOBN(0x8ce815ea, 0x6ffdcbaf), TOBN(0xa3428878, 0xfb648867)}, + {TOBN(0x277699cf, 0x884655fb), TOBN(0xfa5b5bd6, 0x364d3e41), + TOBN(0x01f680c6, 0x441e1cb7), TOBN(0x3fd61e66, 0xb70a7d67)}}, + {{TOBN(0x666ba2dc, 0xcc78cf66), TOBN(0xb3018174, 0x6fdbff77), + TOBN(0x8d4dd0db, 0x168d4668), TOBN(0x259455d0, 0x1dab3a2a)}, + {TOBN(0xf58564c5, 0xcde3acec), TOBN(0x77141925, 0x13adb276), + TOBN(0x527d725d, 0x8a303f65), TOBN(0x55deb6c9, 0xe6f38f7b)}}, + {{TOBN(0xfd5bb657, 0xb1fa70fb), TOBN(0xfa07f50f, 0xd8073a00), + TOBN(0xf72e3aa7, 0xbca02500), TOBN(0xf68f895d, 0x9975740d)}, + {TOBN(0x30112060, 0x5cae2a6a), TOBN(0x01bd7218, 0x02874842), + TOBN(0x3d423891, 0x7ce47bd3), TOBN(0xa66663c1, 0x789544f6)}}, + {{TOBN(0x864d05d7, 0x3272d838), TOBN(0xe22924f9, 0xfa6295c5), + TOBN(0x8189593f, 0x6c2fda32), TOBN(0x330d7189, 0xb184b544)}, + {TOBN(0x79efa62c, 0xbde1f714), TOBN(0x35771c94, 0xe5cb1a63), + TOBN(0x2f4826b8, 0x641c8332), TOBN(0x00a894fb, 0xc8cee854)}}, + {{TOBN(0xb4b9a39b, 0x36194d40), TOBN(0xe857a7c5, 0x77612601), + TOBN(0xf4209dd2, 0x4ecf2f58), TOBN(0x82b9e66d, 0x5a033487)}, + {TOBN(0xc1e36934, 0xe4e8b9dd), TOBN(0xd2372c9d, 0xa42377d7), + TOBN(0x51dc94c7, 0x0e3ae43b), TOBN(0x4c57761e, 0x04474f6f)}}, + {{TOBN(0xdcdacd0a, 0x1058a318), TOBN(0x369cf3f5, 0x78053a9a), + TOBN(0xc6c3de50, 0x31c68de2), TOBN(0x4653a576, 0x3c4b6d9f)}, + {TOBN(0x1688dd5a, 0xaa4e5c97), TOBN(0x5be80aa1, 0xb7ab3c74), + TOBN(0x70cefe7c, 0xbc65c283), TOBN(0x57f95f13, 0x06867091)}}, + {{TOBN(0xa39114e2, 0x4415503b), TOBN(0xc08ff7c6, 0x4cbb17e9), + TOBN(0x1eff674d, 0xd7dec966), TOBN(0x6d4690af, 0x53376f63)}, + {TOBN(0xff6fe32e, 0xea74237b), TOBN(0xc436d17e, 0xcd57508e), + TOBN(0x15aa28e1, 0xedcc40fe), TOBN(0x0d769c04, 0x581bbb44)}}, + {{TOBN(0xc240b6de, 0x34eaacda), TOBN(0xd9e116e8, 0x2ba0f1de), + TOBN(0xcbe45ec7, 0x79438e55), TOBN(0x91787c9d, 0x96f752d7)}, + {TOBN(0x897f532b, 0xf129ac2f), TOBN(0xd307b7c8, 0x5a36e22c), + TOBN(0x91940675, 0x749fb8f3), TOBN(0xd14f95d0, 0x157fdb28)}}, + {{TOBN(0xfe51d029, 0x6ae55043), TOBN(0x8931e98f, 0x44a87de1), + TOBN(0xe57f1cc6, 0x09e4fee2), TOBN(0x0d063b67, 0x4e072d92)}, + {TOBN(0x70a998b9, 0xed0e4316), TOBN(0xe74a736b, 0x306aca46), + TOBN(0xecf0fbf2, 0x4fda97c7), TOBN(0xa40f65cb, 0x3e178d93)}}, + {{TOBN(0x16253604, 0x16df4285), TOBN(0xb0c9babb, 0xd0c56ae2), + TOBN(0x73032b19, 0xcfc5cfc3), TOBN(0xe497e5c3, 0x09752056)}, + {TOBN(0x12096bb4, 0x164bda96), TOBN(0x1ee42419, 0xa0b74da1), + TOBN(0x8fc36243, 0x403826ba), TOBN(0x0c8f0069, 0xdc09e660)}}, + {{TOBN(0x8667e981, 0xc27253c9), TOBN(0x05a6aefb, 0x92b36a45), + TOBN(0xa62c4b36, 0x9cb7bb46), TOBN(0x8394f375, 0x11f7027b)}, + {TOBN(0x747bc79c, 0x5f109d0f), TOBN(0xcad88a76, 0x5b8cc60a), + TOBN(0x80c5a66b, 0x58f09e68), TOBN(0xe753d451, 0xf6127eac)}}, + {{TOBN(0xc44b74a1, 0x5b0ec6f5), TOBN(0x47989fe4, 0x5289b2b8), + TOBN(0x745f8484, 0x58d6fc73), TOBN(0xec362a6f, 0xf61c70ab)}, + {TOBN(0x070c98a7, 0xb3a8ad41), TOBN(0x73a20fc0, 0x7b63db51), + TOBN(0xed2c2173, 0xf44c35f4), TOBN(0x8a56149d, 0x9acc9dca)}}, + {{TOBN(0x98f17881, 0x9ac6e0f4), TOBN(0x360fdeaf, 0xa413b5ed), + TOBN(0x0625b8f4, 0xa300b0fd), TOBN(0xf1f4d76a, 0x5b3222d3)}, + {TOBN(0x9d6f5109, 0x587f76b8), TOBN(0x8b4ee08d, 0x2317fdb5), + TOBN(0x88089bb7, 0x8c68b095), TOBN(0x95570e9a, 0x5808d9b9)}}, + {{TOBN(0xa395c36f, 0x35d33ae7), TOBN(0x200ea123, 0x50bb5a94), + TOBN(0x20c789bd, 0x0bafe84b), TOBN(0x243ef52d, 0x0919276a)}, + {TOBN(0x3934c577, 
0xe23ae233), TOBN(0xb93807af, 0xa460d1ec), + TOBN(0xb72a53b1, 0xf8fa76a4), TOBN(0xd8914cb0, 0xc3ca4491)}}, + {{TOBN(0x2e128494, 0x3fb42622), TOBN(0x3b2700ac, 0x500907d5), + TOBN(0xf370fb09, 0x1a95ec63), TOBN(0xf8f30be2, 0x31b6dfbd)}, + {TOBN(0xf2b2f8d2, 0x69e55f15), TOBN(0x1fead851, 0xcc1323e9), + TOBN(0xfa366010, 0xd9e5eef6), TOBN(0x64d487b0, 0xe316107e)}}, + {{TOBN(0x4c076b86, 0xd23ddc82), TOBN(0x03fd344c, 0x7e0143f0), + TOBN(0xa95362ff, 0x317af2c5), TOBN(0x0add3db7, 0xe18b7a4f)}, + {TOBN(0x9c673e3f, 0x8260e01b), TOBN(0xfbeb49e5, 0x54a1cc91), + TOBN(0x91351bf2, 0x92f2e433), TOBN(0xc755e7ec, 0x851141eb)}}, + {{TOBN(0xc9a95139, 0x29607745), TOBN(0x0ca07420, 0xa26f2b28), + TOBN(0xcb2790e7, 0x4bc6f9dd), TOBN(0x345bbb58, 0xadcaffc0)}, + {TOBN(0xc65ea38c, 0xbe0f27a2), TOBN(0x67c24d7c, 0x641fcb56), + TOBN(0x2c25f0a7, 0xa9e2c757), TOBN(0x93f5cdb0, 0x16f16c49)}}, + {{TOBN(0x2ca5a9d7, 0xc5ee30a1), TOBN(0xd1593635, 0xb909b729), + TOBN(0x804ce9f3, 0xdadeff48), TOBN(0xec464751, 0xb07c30c3)}, + {TOBN(0x89d65ff3, 0x9e49af6a), TOBN(0xf2d6238a, 0x6f3d01bc), + TOBN(0x1095561e, 0x0bced843), TOBN(0x51789e12, 0xc8a13fd8)}}, + {{TOBN(0xd633f929, 0x763231df), TOBN(0x46df9f7d, 0xe7cbddef), + TOBN(0x01c889c0, 0xcb265da8), TOBN(0xfce1ad10, 0xaf4336d2)}, + {TOBN(0x8d110df6, 0xfc6a0a7e), TOBN(0xdd431b98, 0x6da425dc), + TOBN(0xcdc4aeab, 0x1834aabe), TOBN(0x84deb124, 0x8439b7fc)}}, + {{TOBN(0x8796f169, 0x3c2a5998), TOBN(0x9b9247b4, 0x7947190d), + TOBN(0x55b9d9a5, 0x11597014), TOBN(0x7e9dd70d, 0x7b1566ee)}, + {TOBN(0x94ad78f7, 0xcbcd5e64), TOBN(0x0359ac17, 0x9bd4c032), + TOBN(0x3b11baaf, 0x7cc222ae), TOBN(0xa6a6e284, 0xba78e812)}}, + {{TOBN(0x8392053f, 0x24cea1a0), TOBN(0xc97bce4a, 0x33621491), + TOBN(0x7eb1db34, 0x35399ee9), TOBN(0x473f78ef, 0xece81ad1)}, + {TOBN(0x41d72fe0, 0xf63d3d0d), TOBN(0xe620b880, 0xafab62fc), + TOBN(0x92096bc9, 0x93158383), TOBN(0x41a21357, 0x8f896f6c)}}, + {{TOBN(0x1b5ee2fa, 0xc7dcfcab), TOBN(0x650acfde, 0x9546e007), + TOBN(0xc081b749, 0xb1b02e07), TOBN(0xda9e41a0, 0xf9eca03d)}, + {TOBN(0x013ba727, 0x175a54ab), TOBN(0xca0cd190, 0xea5d8d10), + TOBN(0x85ea52c0, 0x95fd96a9), TOBN(0x2c591b9f, 0xbc5c3940)}}, + {{TOBN(0x6fb4d4e4, 0x2bad4d5f), TOBN(0xfa4c3590, 0xfef0059b), + TOBN(0x6a10218a, 0xf5122294), TOBN(0x9a78a81a, 0xa85751d1)}, + {TOBN(0x04f20579, 0xa98e84e7), TOBN(0xfe1242c0, 0x4997e5b5), + TOBN(0xe77a273b, 0xca21e1e4), TOBN(0xfcc8b1ef, 0x9411939d)}}, + {{TOBN(0xe20ea302, 0x92d0487a), TOBN(0x1442dbec, 0x294b91fe), + TOBN(0x1f7a4afe, 0xbb6b0e8f), TOBN(0x1700ef74, 0x6889c318)}, + {TOBN(0xf5bbffc3, 0x70f1fc62), TOBN(0x3b31d4b6, 0x69c79cca), + TOBN(0xe8bc2aab, 0xa7f6340d), TOBN(0xb0b08ab4, 0xa725e10a)}}, + {{TOBN(0x44f05701, 0xae340050), TOBN(0xba4b3016, 0x1cf0c569), + TOBN(0x5aa29f83, 0xfbe19a51), TOBN(0x1b9ed428, 0xb71d752e)}, + {TOBN(0x1666e54e, 0xeb4819f5), TOBN(0x616cdfed, 0x9e18b75b), + TOBN(0x112ed5be, 0x3ee27b0b), TOBN(0xfbf28319, 0x44c7de4d)}}, + {{TOBN(0xd685ec85, 0xe0e60d84), TOBN(0x68037e30, 0x1db7ee78), + TOBN(0x5b65bdcd, 0x003c4d6e), TOBN(0x33e7363a, 0x93e29a6a)}, + {TOBN(0x995b3a61, 0x08d0756c), TOBN(0xd727f85c, 0x2faf134b), + TOBN(0xfac6edf7, 0x1d337823), TOBN(0x99b9aa50, 0x0439b8b4)}}, + {{TOBN(0x722eb104, 0xe2b4e075), TOBN(0x49987295, 0x437c4926), + TOBN(0xb1e4c0e4, 0x46a9b82d), TOBN(0xd0cb3197, 0x57a006f5)}, + {TOBN(0xf3de0f7d, 0xd7808c56), TOBN(0xb5c54d8f, 0x51f89772), + TOBN(0x500a114a, 0xadbd31aa), TOBN(0x9afaaaa6, 0x295f6cab)}}, + {{TOBN(0x94705e21, 0x04cf667a), TOBN(0xfc2a811b, 0x9d3935d7), + TOBN(0x560b0280, 0x6d09267c), TOBN(0xf19ed119, 0xf780e53b)}, + 
{TOBN(0xf0227c09, 0x067b6269), TOBN(0x967b8533, 0x5caef599), + TOBN(0x155b9243, 0x68efeebc), TOBN(0xcd6d34f5, 0xc497bae6)}}, + {{TOBN(0x1dd8d5d3, 0x6cceb370), TOBN(0x2aeac579, 0xa78d7bf9), + TOBN(0x5d65017d, 0x70b67a62), TOBN(0x70c8e44f, 0x17c53f67)}, + {TOBN(0xd1fc0950, 0x86a34d09), TOBN(0xe0fca256, 0xe7134907), + TOBN(0xe24fa29c, 0x80fdd315), TOBN(0x2c4acd03, 0xd87499ad)}}, + {{TOBN(0xbaaf7517, 0x3b5a9ba6), TOBN(0xb9cbe1f6, 0x12e51a51), + TOBN(0xd88edae3, 0x5e154897), TOBN(0xe4309c3c, 0x77b66ca0)}, + {TOBN(0xf5555805, 0xf67f3746), TOBN(0x85fc37ba, 0xa36401ff), + TOBN(0xdf86e2ca, 0xd9499a53), TOBN(0x6270b2a3, 0xecbc955b)}}, + {{TOBN(0xafae64f5, 0x974ad33b), TOBN(0x04d85977, 0xfe7b2df1), + TOBN(0x2a3db3ff, 0x4ab03f73), TOBN(0x0b87878a, 0x8702740a)}, + {TOBN(0x6d263f01, 0x5a061732), TOBN(0xc25430ce, 0xa32a1901), + TOBN(0xf7ebab3d, 0xdb155018), TOBN(0x3a86f693, 0x63a9b78e)}}, + {{TOBN(0x349ae368, 0xda9f3804), TOBN(0x470f07fe, 0xa164349c), + TOBN(0xd52f4cc9, 0x8562baa5), TOBN(0xc74a9e86, 0x2b290df3)}, + {TOBN(0xd3a1aa35, 0x43471a24), TOBN(0x239446be, 0xb8194511), + TOBN(0xbec2dd00, 0x81dcd44d), TOBN(0xca3d7f0f, 0xc42ac82d)}}, + {{TOBN(0x1f3db085, 0xfdaf4520), TOBN(0xbb6d3e80, 0x4549daf2), + TOBN(0xf5969d8a, 0x19ad5c42), TOBN(0x7052b13d, 0xdbfd1511)}, + {TOBN(0x11890d1b, 0x682b9060), TOBN(0xa71d3883, 0xac34452c), + TOBN(0xa438055b, 0x783805b4), TOBN(0x43241277, 0x4725b23e)}}, + {{TOBN(0xf20cf96e, 0x4901bbed), TOBN(0x6419c710, 0xf432a2bb), + TOBN(0x57a0fbb9, 0xdfa9cd7d), TOBN(0x589111e4, 0x00daa249)}, + {TOBN(0x19809a33, 0x7b60554e), TOBN(0xea5f8887, 0xede283a4), + TOBN(0x2d713802, 0x503bfd35), TOBN(0x151bb0af, 0x585d2a53)}}, + {{TOBN(0x40b08f74, 0x43b30ca8), TOBN(0xe10b5bba, 0xd9934583), + TOBN(0xe8a546d6, 0xb51110ad), TOBN(0x1dd50e66, 0x28e0b6c5)}, + {TOBN(0x292e9d54, 0xcff2b821), TOBN(0x3882555d, 0x47281760), + TOBN(0x134838f8, 0x3724d6e3), TOBN(0xf2c679e0, 0x22ddcda1)}}, + {{TOBN(0x40ee8815, 0x6d2a5768), TOBN(0x7f227bd2, 0x1c1e7e2d), + TOBN(0x487ba134, 0xd04ff443), TOBN(0x76e2ff3d, 0xc614e54b)}, + {TOBN(0x36b88d6f, 0xa3177ec7), TOBN(0xbf731d51, 0x2328fff5), + TOBN(0x758caea2, 0x49ba158e), TOBN(0x5ab8ff4c, 0x02938188)}}, + {{TOBN(0x33e16056, 0x35edc56d), TOBN(0x5a69d349, 0x7e940d79), + TOBN(0x6c4fd001, 0x03866dcb), TOBN(0x20a38f57, 0x4893cdef)}, + {TOBN(0xfbf3e790, 0xfac3a15b), TOBN(0x6ed7ea2e, 0x7a4f8e6b), + TOBN(0xa663eb4f, 0xbc3aca86), TOBN(0x22061ea5, 0x080d53f7)}}, + {{TOBN(0x2480dfe6, 0xf546783f), TOBN(0xd38bc6da, 0x5a0a641e), + TOBN(0xfb093cd1, 0x2ede8965), TOBN(0x89654db4, 0xacb455cf)}, + {TOBN(0x413cbf9a, 0x26e1adee), TOBN(0x291f3764, 0x373294d4), + TOBN(0x00797257, 0x648083fe), TOBN(0x25f504d3, 0x208cc341)}}, + {{TOBN(0x635a8e5e, 0xc3a0ee43), TOBN(0x70aaebca, 0x679898ff), + TOBN(0x9ee9f547, 0x5dc63d56), TOBN(0xce987966, 0xffb34d00)}, + {TOBN(0xf9f86b19, 0x5e26310a), TOBN(0x9e435484, 0x382a8ca8), + TOBN(0x253bcb81, 0xc2352fe4), TOBN(0xa4eac8b0, 0x4474b571)}}, + {{TOBN(0xc1b97512, 0xc1ad8cf8), TOBN(0x193b4e9e, 0x99e0b697), + TOBN(0x939d2716, 0x01e85df0), TOBN(0x4fb265b3, 0xcd44eafd)}, + {TOBN(0x321e7dcd, 0xe51e1ae2), TOBN(0x8e3a8ca6, 0xe3d8b096), + TOBN(0x8de46cb0, 0x52604998), TOBN(0x91099ad8, 0x39072aa7)}}, + {{TOBN(0x2617f91c, 0x93aa96b8), TOBN(0x0fc8716b, 0x7fca2e13), + TOBN(0xa7106f5e, 0x95328723), TOBN(0xd1c9c40b, 0x262e6522)}, + {TOBN(0xb9bafe86, 0x42b7c094), TOBN(0x1873439d, 0x1543c021), + TOBN(0xe1baa5de, 0x5cbefd5d), TOBN(0xa363fc5e, 0x521e8aff)}}, + {{TOBN(0xefe6320d, 0xf862eaac), TOBN(0x14419c63, 0x22c647dc), + TOBN(0x0e06707c, 0x4e46d428), TOBN(0xcb6c834f, 0x4a178f8f)}, 
+ {TOBN(0x0f993a45, 0xd30f917c), TOBN(0xd4c4b049, 0x9879afee), + TOBN(0xb6142a1e, 0x70500063), TOBN(0x7c9b41c3, 0xa5d9d605)}}, + {{TOBN(0xbc00fc2f, 0x2f8ba2c7), TOBN(0x0966eb2f, 0x7c67aa28), + TOBN(0x13f7b516, 0x5a786972), TOBN(0x3bfb7557, 0x8a2fbba0)}, + {TOBN(0x131c4f23, 0x5a2b9620), TOBN(0xbff3ed27, 0x6faf46be), + TOBN(0x9b4473d1, 0x7e172323), TOBN(0x421e8878, 0x339f6246)}}, + {{TOBN(0x0fa8587a, 0x25a41632), TOBN(0xc0814124, 0xa35b6c93), + TOBN(0x2b18a9f5, 0x59ebb8db), TOBN(0x264e3357, 0x76edb29c)}, + {TOBN(0xaf245ccd, 0xc87c51e2), TOBN(0x16b3015b, 0x501e6214), + TOBN(0xbb31c560, 0x0a3882ce), TOBN(0x6961bb94, 0xfec11e04)}}, + {{TOBN(0x3b825b8d, 0xeff7a3a0), TOBN(0xbec33738, 0xb1df7326), + TOBN(0x68ad747c, 0x99604a1f), TOBN(0xd154c934, 0x9a3bd499)}, + {TOBN(0xac33506f, 0x1cc7a906), TOBN(0x73bb5392, 0x6c560e8f), + TOBN(0x6428fcbe, 0x263e3944), TOBN(0xc11828d5, 0x1c387434)}}, + {{TOBN(0x3cd04be1, 0x3e4b12ff), TOBN(0xc3aad9f9, 0x2d88667c), + TOBN(0xc52ddcf8, 0x248120cf), TOBN(0x985a892e, 0x2a389532)}, + {TOBN(0xfbb4b21b, 0x3bb85fa0), TOBN(0xf95375e0, 0x8dfc6269), + TOBN(0xfb4fb06c, 0x7ee2acea), TOBN(0x6785426e, 0x309c4d1f)}}, + {{TOBN(0x659b17c8, 0xd8ceb147), TOBN(0x9b649eee, 0xb70a5554), + TOBN(0x6b7fa0b5, 0xac6bc634), TOBN(0xd99fe2c7, 0x1d6e732f)}, + {TOBN(0x30e6e762, 0x8d3abba2), TOBN(0x18fee6e7, 0xa797b799), + TOBN(0x5c9d360d, 0xc696464d), TOBN(0xe3baeb48, 0x27bfde12)}}, + {{TOBN(0x2bf5db47, 0xf23206d5), TOBN(0x2f6d3420, 0x1d260152), + TOBN(0x17b87653, 0x3f8ff89a), TOBN(0x5157c30c, 0x378fa458)}, + {TOBN(0x7517c5c5, 0x2d4fb936), TOBN(0xef22f7ac, 0xe6518cdc), + TOBN(0xdeb483e6, 0xbf847a64), TOBN(0xf5084558, 0x92e0fa89)}}}, + {{{TOBN(0xab9659d8, 0xdf7304d4), TOBN(0xb71bcf1b, 0xff210e8e), + TOBN(0xa9a2438b, 0xd73fbd60), TOBN(0x4595cd1f, 0x5d11b4de)}, + {TOBN(0x9c0d329a, 0x4835859d), TOBN(0x4a0f0d2d, 0x7dbb6e56), + TOBN(0xc6038e5e, 0xdf928a4e), TOBN(0xc9429621, 0x8f5ad154)}}, + {{TOBN(0x91213462, 0xf23f2d92), TOBN(0x6cab71bd, 0x60b94078), + TOBN(0x6bdd0a63, 0x176cde20), TOBN(0x54c9b20c, 0xee4d54bc)}, + {TOBN(0x3cd2d8aa, 0x9f2ac02f), TOBN(0x03f8e617, 0x206eedb0), + TOBN(0xc7f68e16, 0x93086434), TOBN(0x831469c5, 0x92dd3db9)}}, + {{TOBN(0x8521df24, 0x8f981354), TOBN(0x587e23ec, 0x3588a259), + TOBN(0xcbedf281, 0xd7a0992c), TOBN(0x06930a55, 0x38961407)}, + {TOBN(0x09320deb, 0xbe5bbe21), TOBN(0xa7ffa5b5, 0x2491817f), + TOBN(0xe6c8b4d9, 0x09065160), TOBN(0xac4f3992, 0xfff6d2a9)}}, + {{TOBN(0x7aa7a158, 0x3ae9c1bd), TOBN(0xe0af6d98, 0xe37ce240), + TOBN(0xe54342d9, 0x28ab38b4), TOBN(0xe8b75007, 0x0a1c98ca)}, + {TOBN(0xefce86af, 0xe02358f2), TOBN(0x31b8b856, 0xea921228), + TOBN(0x052a1912, 0x0a1c67fc), TOBN(0xb4069ea4, 0xe3aead59)}}, + {{TOBN(0x3232d6e2, 0x7fa03cb3), TOBN(0xdb938e5b, 0x0fdd7d88), + TOBN(0x04c1d2cd, 0x2ccbfc5d), TOBN(0xd2f45c12, 0xaf3a580f)}, + {TOBN(0x592620b5, 0x7883e614), TOBN(0x5fd27e68, 0xbe7c5f26), + TOBN(0x139e45a9, 0x1567e1e3), TOBN(0x2cc71d2d, 0x44d8aaaf)}}, + {{TOBN(0x4a9090cd, 0xe36d0757), TOBN(0xf722d7b1, 0xd9a29382), + TOBN(0xfb7fb04c, 0x04b48ddf), TOBN(0x628ad2a7, 0xebe16f43)}, + {TOBN(0xcd3fbfb5, 0x20226040), TOBN(0x6c34ecb1, 0x5104b6c4), + TOBN(0x30c0754e, 0xc903c188), TOBN(0xec336b08, 0x2d23cab0)}}, + {{TOBN(0x473d62a2, 0x1e206ee5), TOBN(0xf1e27480, 0x8c49a633), + TOBN(0x87ab956c, 0xe9f6b2c3), TOBN(0x61830b48, 0x62b606ea)}, + {TOBN(0x67cd6846, 0xe78e815f), TOBN(0xfe40139f, 0x4c02082a), + TOBN(0x52bbbfcb, 0x952ec365), TOBN(0x74c11642, 0x6b9836ab)}}, + {{TOBN(0x9f51439e, 0x558df019), TOBN(0x230da4ba, 0xac712b27), + TOBN(0x518919e3, 0x55185a24), TOBN(0x4dcefcdd, 
0x84b78f50)}, + {TOBN(0xa7d90fb2, 0xa47d4c5a), TOBN(0x55ac9abf, 0xb30e009e), + TOBN(0xfd2fc359, 0x74eed273), TOBN(0xb72d824c, 0xdbea8faf)}}, + {{TOBN(0xce721a74, 0x4513e2ca), TOBN(0x0b418612, 0x38240b2c), + TOBN(0x05199968, 0xd5baa450), TOBN(0xeb1757ed, 0x2b0e8c25)}, + {TOBN(0x6ebc3e28, 0x3dfac6d5), TOBN(0xb2431e2e, 0x48a237f5), + TOBN(0x2acb5e23, 0x52f61499), TOBN(0x5558a2a7, 0xe06c936b)}}, + {{TOBN(0xd213f923, 0xcbb13d1b), TOBN(0x98799f42, 0x5bfb9bfe), + TOBN(0x1ae8ddc9, 0x701144a9), TOBN(0x0b8b3bb6, 0x4c5595ee)}, + {TOBN(0x0ea9ef2e, 0x3ecebb21), TOBN(0x17cb6c4b, 0x3671f9a7), + TOBN(0x47ef464f, 0x726f1d1f), TOBN(0x171b9484, 0x6943a276)}}, + {{TOBN(0x51a4ae2d, 0x7ef0329c), TOBN(0x08509222, 0x91c4402a), + TOBN(0x64a61d35, 0xafd45bbc), TOBN(0x38f096fe, 0x3035a851)}, + {TOBN(0xc7468b74, 0xa1dec027), TOBN(0xe8cf10e7, 0x4fc7dcba), + TOBN(0xea35ff40, 0xf4a06353), TOBN(0x0b4c0dfa, 0x8b77dd66)}}, + {{TOBN(0x779b8552, 0xde7e5c19), TOBN(0xfab28609, 0xc1c0256c), + TOBN(0x64f58eee, 0xabd4743d), TOBN(0x4e8ef838, 0x7b6cc93b)}, + {TOBN(0xee650d26, 0x4cb1bf3d), TOBN(0x4c1f9d09, 0x73dedf61), + TOBN(0xaef7c9d7, 0xbfb70ced), TOBN(0x1ec0507e, 0x1641de1e)}}, + {{TOBN(0xcd7e5cc7, 0xcde45079), TOBN(0xde173c9a, 0x516ac9e4), + TOBN(0x517a8494, 0xc170315c), TOBN(0x438fd905, 0x91d8e8fb)}, + {TOBN(0x5145c506, 0xc7d9630b), TOBN(0x6457a87b, 0xf47d4d75), + TOBN(0xd31646bf, 0x0d9a80e8), TOBN(0x453add2b, 0xcef3aabe)}}, + {{TOBN(0xc9941109, 0xa607419d), TOBN(0xfaa71e62, 0xbb6bca80), + TOBN(0x34158c13, 0x07c431f3), TOBN(0x594abebc, 0x992bc47a)}, + {TOBN(0x6dfea691, 0xeb78399f), TOBN(0x48aafb35, 0x3f42cba4), + TOBN(0xedcd65af, 0x077c04f0), TOBN(0x1a29a366, 0xe884491a)}}, + {{TOBN(0x023a40e5, 0x1c21f2bf), TOBN(0xf99a513c, 0xa5057aee), + TOBN(0xa3fe7e25, 0xbcab072e), TOBN(0x8568d2e1, 0x40e32bcf)}, + {TOBN(0x904594eb, 0xd3f69d9f), TOBN(0x181a9733, 0x07affab1), + TOBN(0xe4d68d76, 0xb6e330f4), TOBN(0x87a6dafb, 0xc75a7fc1)}}, + {{TOBN(0x549db2b5, 0xef7d9289), TOBN(0x2480d4a8, 0x197f015a), + TOBN(0x61d5590b, 0xc40493b6), TOBN(0x3a55b52e, 0x6f780331)}, + {TOBN(0x40eb8115, 0x309eadb0), TOBN(0xdea7de5a, 0x92e5c625), + TOBN(0x64d631f0, 0xcc6a3d5a), TOBN(0x9d5e9d7c, 0x93e8dd61)}}, + {{TOBN(0xf297bef5, 0x206d3ffc), TOBN(0x23d5e033, 0x7d808bd4), + TOBN(0x4a4f6912, 0xd24cf5ba), TOBN(0xe4d8163b, 0x09cdaa8a)}, + {TOBN(0x0e0de9ef, 0xd3082e8e), TOBN(0x4fe1246c, 0x0192f360), + TOBN(0x1f900150, 0x4b8eee0a), TOBN(0x5219da81, 0xf1da391b)}}, + {{TOBN(0x7bf6a5c1, 0xf7ea25aa), TOBN(0xd165e6bf, 0xfbb07d5f), + TOBN(0xe3539361, 0x89e78671), TOBN(0xa3fcac89, 0x2bac4219)}, + {TOBN(0xdfab6fd4, 0xf0baa8ab), TOBN(0x5a4adac1, 0xe2c1c2e5), + TOBN(0x6cd75e31, 0x40d85849), TOBN(0xce263fea, 0x19b39181)}}, + {{TOBN(0xcb6803d3, 0x07032c72), TOBN(0x7f40d5ce, 0x790968c8), + TOBN(0xa6de86bd, 0xdce978f0), TOBN(0x25547c4f, 0x368f751c)}, + {TOBN(0xb1e685fd, 0x65fb2a9e), TOBN(0xce69336f, 0x1eb9179c), + TOBN(0xb15d1c27, 0x12504442), TOBN(0xb7df465c, 0xb911a06b)}}, + {{TOBN(0xb8d804a3, 0x315980cd), TOBN(0x693bc492, 0xfa3bebf7), + TOBN(0x3578aeee, 0x2253c504), TOBN(0x158de498, 0xcd2474a2)}, + {TOBN(0x1331f5c7, 0xcfda8368), TOBN(0xd2d7bbb3, 0x78d7177e), + TOBN(0xdf61133a, 0xf3c1e46e), TOBN(0x5836ce7d, 0xd30e7be8)}}, + {{TOBN(0x83084f19, 0x94f834cb), TOBN(0xd35653d4, 0x429ed782), + TOBN(0xa542f16f, 0x59e58243), TOBN(0xc2b52f65, 0x0470a22d)}, + {TOBN(0xe3b6221b, 0x18f23d96), TOBN(0xcb05abac, 0x3f5252b4), + TOBN(0xca00938b, 0x87d61402), TOBN(0x2f186cdd, 0x411933e4)}}, + {{TOBN(0xe042ece5, 0x9a29a5c5), TOBN(0xb19b3c07, 0x3b6c8402), + TOBN(0xc97667c7, 0x19d92684), 
TOBN(0xb5624622, 0xebc66372)}, + {TOBN(0x0cb96e65, 0x3c04fa02), TOBN(0x83a7176c, 0x8eaa39aa), + TOBN(0x2033561d, 0xeaa1633f), TOBN(0x45a9d086, 0x4533df73)}}, + {{TOBN(0xe0542c1d, 0x3dc090bc), TOBN(0x82c996ef, 0xaa59c167), + TOBN(0xe3f735e8, 0x0ee7fc4d), TOBN(0x7b179393, 0x7c35db79)}, + {TOBN(0xb6419e25, 0xf8c5dbfd), TOBN(0x4d9d7a1e, 0x1f327b04), + TOBN(0x979f6f9b, 0x298dfca8), TOBN(0xc7c5dff1, 0x8de9366a)}}, + {{TOBN(0x1b7a588d, 0x04c82bdd), TOBN(0x68005534, 0xf8319dfd), + TOBN(0xde8a55b5, 0xd8eb9580), TOBN(0x5ea886da, 0x8d5bca81)}, + {TOBN(0xe8530a01, 0x252a0b4d), TOBN(0x1bffb4fe, 0x35eaa0a1), + TOBN(0x2ad828b1, 0xd8e99563), TOBN(0x7de96ef5, 0x95f9cd87)}}, + {{TOBN(0x4abb2d0c, 0xd77d970c), TOBN(0x03cfb933, 0xd33ef9cb), + TOBN(0xb0547c01, 0x8b211fe9), TOBN(0x2fe64809, 0xa56ed1c6)}, + {TOBN(0xcb7d5624, 0xc2ac98cc), TOBN(0x2a1372c0, 0x1a393e33), + TOBN(0xc8d1ec1c, 0x29660521), TOBN(0xf3d31b04, 0xb37ac3e9)}}, + {{TOBN(0xa29ae9df, 0x5ece6e7c), TOBN(0x0603ac8f, 0x0facfb55), + TOBN(0xcfe85b7a, 0xdda233a5), TOBN(0xe618919f, 0xbd75f0b8)}, + {TOBN(0xf555a3d2, 0x99bf1603), TOBN(0x1f43afc9, 0xf184255a), + TOBN(0xdcdaf341, 0x319a3e02), TOBN(0xd3b117ef, 0x03903a39)}}, + {{TOBN(0xe095da13, 0x65d1d131), TOBN(0x86f16367, 0xc37ad03e), + TOBN(0x5f37389e, 0x462cd8dd), TOBN(0xc103fa04, 0xd67a60e6)}, + {TOBN(0x57c34344, 0xf4b478f0), TOBN(0xce91edd8, 0xe117c98d), + TOBN(0x001777b0, 0x231fc12e), TOBN(0x11ae47f2, 0xb207bccb)}}, + {{TOBN(0xd983cf8d, 0x20f8a242), TOBN(0x7aff5b1d, 0xf22e1ad8), + TOBN(0x68fd11d0, 0x7fc4feb3), TOBN(0x5d53ae90, 0xb0f1c3e1)}, + {TOBN(0x50fb7905, 0xec041803), TOBN(0x85e3c977, 0x14404888), + TOBN(0x0e67faed, 0xac628d8f), TOBN(0x2e865150, 0x6668532c)}}, + {{TOBN(0x15acaaa4, 0x6a67a6b0), TOBN(0xf4cdee25, 0xb25cec41), + TOBN(0x49ee565a, 0xe4c6701e), TOBN(0x2a04ca66, 0xfc7d63d8)}, + {TOBN(0xeb105018, 0xef0543fb), TOBN(0xf709a4f5, 0xd1b0d81d), + TOBN(0x5b906ee6, 0x2915d333), TOBN(0xf4a87412, 0x96f1f0ab)}}, + {{TOBN(0xb6b82fa7, 0x4d82f4c2), TOBN(0x90725a60, 0x6804efb3), + TOBN(0xbc82ec46, 0xadc3425e), TOBN(0xb7b80581, 0x2787843e)}, + {TOBN(0xdf46d91c, 0xdd1fc74c), TOBN(0xdc1c62cb, 0xe783a6c4), + TOBN(0x59d1b9f3, 0x1a04cbba), TOBN(0xd87f6f72, 0x95e40764)}}, + {{TOBN(0x02b4cfc1, 0x317f4a76), TOBN(0x8d2703eb, 0x91036bce), + TOBN(0x98206cc6, 0xa5e72a56), TOBN(0x57be9ed1, 0xcf53fb0f)}, + {TOBN(0x09374571, 0xef0b17ac), TOBN(0x74b2655e, 0xd9181b38), + TOBN(0xc8f80ea8, 0x89935d0e), TOBN(0xc0d9e942, 0x91529936)}}, + {{TOBN(0x19686041, 0x1e84e0e5), TOBN(0xa5db84d3, 0xaea34c93), + TOBN(0xf9d5bb19, 0x7073a732), TOBN(0xb8d2fe56, 0x6bcfd7c0)}, + {TOBN(0x45775f36, 0xf3eb82fa), TOBN(0x8cb20ccc, 0xfdff8b58), + TOBN(0x1659b65f, 0x8374c110), TOBN(0xb8b4a422, 0x330c789a)}}, + {{TOBN(0x75e3c3ea, 0x6fe8208b), TOBN(0xbd74b9e4, 0x286e78fe), + TOBN(0x0be2e81b, 0xd7d93a1a), TOBN(0x7ed06e27, 0xdd0a5aae)}, + {TOBN(0x721f5a58, 0x6be8b800), TOBN(0x428299d1, 0xd846db28), + TOBN(0x95cb8e6b, 0x5be88ed3), TOBN(0xc3186b23, 0x1c034e11)}}, + {{TOBN(0xa6312c9e, 0x8977d99b), TOBN(0xbe944331, 0x83f531e7), + TOBN(0x8232c0c2, 0x18d3b1d4), TOBN(0x617aae8b, 0xe1247b73)}, + {TOBN(0x40153fc4, 0x282aec3b), TOBN(0xc6063d2f, 0xf7b8f823), + TOBN(0x68f10e58, 0x3304f94c), TOBN(0x31efae74, 0xee676346)}}, + {{TOBN(0xbadb6c6d, 0x40a9b97c), TOBN(0x14702c63, 0x4f666256), + TOBN(0xdeb954f1, 0x5184b2e3), TOBN(0x5184a526, 0x94b6ca40)}, + {TOBN(0xfff05337, 0x003c32ea), TOBN(0x5aa374dd, 0x205974c7), + TOBN(0x9a763854, 0x4b0dd71a), TOBN(0x459cd27f, 0xdeb947ec)}}, + {{TOBN(0xa6e28161, 0x459c2b92), TOBN(0x2f020fa8, 0x75ee8ef5), + TOBN(0xb132ec2d, 
0x30b06310), TOBN(0xc3e15899, 0xbc6a4530)}, + {TOBN(0xdc5f53fe, 0xaa3f451a), TOBN(0x3a3c7f23, 0xc2d9acac), + TOBN(0x2ec2f892, 0x6b27e58b), TOBN(0x68466ee7, 0xd742799f)}}, + {{TOBN(0x98324dd4, 0x1fa26613), TOBN(0xa2dc6dab, 0xbdc29d63), + TOBN(0xf9675faa, 0xd712d657), TOBN(0x813994be, 0x21fd8d15)}, + {TOBN(0x5ccbb722, 0xfd4f7553), TOBN(0x5135ff8b, 0xf3a36b20), + TOBN(0x44be28af, 0x69559df5), TOBN(0x40b65bed, 0x9d41bf30)}}, + {{TOBN(0xd98bf2a4, 0x3734e520), TOBN(0x5e3abbe3, 0x209bdcba), + TOBN(0x77c76553, 0xbc945b35), TOBN(0x5331c093, 0xc6ef14aa)}, + {TOBN(0x518ffe29, 0x76b60c80), TOBN(0x2285593b, 0x7ace16f8), + TOBN(0xab1f64cc, 0xbe2b9784), TOBN(0xe8f2c0d9, 0xab2421b6)}}, + {{TOBN(0x617d7174, 0xc1df065c), TOBN(0xafeeb5ab, 0x5f6578fa), + TOBN(0x16ff1329, 0x263b54a8), TOBN(0x45c55808, 0xc990dce3)}, + {TOBN(0x42eab6c0, 0xecc8c177), TOBN(0x799ea9b5, 0x5982ecaa), + TOBN(0xf65da244, 0xb607ef8e), TOBN(0x8ab226ce, 0x32a3fc2c)}}, + {{TOBN(0x745741e5, 0x7ea973dc), TOBN(0x5c00ca70, 0x20888f2e), + TOBN(0x7cdce3cf, 0x45fd9cf1), TOBN(0x8a741ef1, 0x5507f872)}, + {TOBN(0x47c51c2f, 0x196b4cec), TOBN(0x70d08e43, 0xc97ea618), + TOBN(0x930da15c, 0x15b18a2b), TOBN(0x33b6c678, 0x2f610514)}}, + {{TOBN(0xc662e4f8, 0x07ac9794), TOBN(0x1eccf050, 0xba06cb79), + TOBN(0x1ff08623, 0xe7d954e5), TOBN(0x6ef2c5fb, 0x24cf71c3)}, + {TOBN(0xb2c063d2, 0x67978453), TOBN(0xa0cf3796, 0x1d654af8), + TOBN(0x7cb242ea, 0x7ebdaa37), TOBN(0x206e0b10, 0xb86747e0)}}, + {{TOBN(0x481dae5f, 0xd5ecfefc), TOBN(0x07084fd8, 0xc2bff8fc), + TOBN(0x8040a01a, 0xea324596), TOBN(0x4c646980, 0xd4de4036)}, + {TOBN(0x9eb8ab4e, 0xd65abfc3), TOBN(0xe01cb91f, 0x13541ec7), + TOBN(0x8f029adb, 0xfd695012), TOBN(0x9ae28483, 0x3c7569ec)}}, + {{TOBN(0xa5614c9e, 0xa66d80a1), TOBN(0x680a3e44, 0x75f5f911), + TOBN(0x0c07b14d, 0xceba4fc1), TOBN(0x891c285b, 0xa13071c1)}, + {TOBN(0xcac67ceb, 0x799ece3c), TOBN(0x29b910a9, 0x41e07e27), + TOBN(0x66bdb409, 0xf2e43123), TOBN(0x06f8b137, 0x7ac9ecbe)}}, + {{TOBN(0x5981fafd, 0x38547090), TOBN(0x19ab8b9f, 0x85e3415d), + TOBN(0xfc28c194, 0xc7e31b27), TOBN(0x843be0aa, 0x6fbcbb42)}, + {TOBN(0xf3b1ed43, 0xa6db836c), TOBN(0x2a1330e4, 0x01a45c05), + TOBN(0x4f19f3c5, 0x95c1a377), TOBN(0xa85f39d0, 0x44b5ee33)}}, + {{TOBN(0x3da18e6d, 0x4ae52834), TOBN(0x5a403b39, 0x7423dcb0), + TOBN(0xbb555e0a, 0xf2374aef), TOBN(0x2ad599c4, 0x1e8ca111)}, + {TOBN(0x1b3a2fb9, 0x014b3bf8), TOBN(0x73092684, 0xf66d5007), + TOBN(0x079f1426, 0xc4340102), TOBN(0x1827cf81, 0x8fddf4de)}}, + {{TOBN(0xc83605f6, 0xf10ff927), TOBN(0xd3871451, 0x23739fc6), + TOBN(0x6d163450, 0xcac1c2cc), TOBN(0x6b521296, 0xa2ec1ac5)}, + {TOBN(0x0606c4f9, 0x6e3cb4a5), TOBN(0xe47d3f41, 0x778abff7), + TOBN(0x425a8d5e, 0xbe8e3a45), TOBN(0x53ea9e97, 0xa6102160)}}, + {{TOBN(0x477a106e, 0x39cbb688), TOBN(0x532401d2, 0xf3386d32), + TOBN(0x8e564f64, 0xb1b9b421), TOBN(0xca9b8388, 0x81dad33f)}, + {TOBN(0xb1422b4e, 0x2093913e), TOBN(0x533d2f92, 0x69bc8112), + TOBN(0x3fa017be, 0xebe7b2c7), TOBN(0xb2767c4a, 0xcaf197c6)}}, + {{TOBN(0xc925ff87, 0xaedbae9f), TOBN(0x7daf0eb9, 0x36880a54), + TOBN(0x9284ddf5, 0x9c4d0e71), TOBN(0x1581cf93, 0x316f8cf5)}, + {TOBN(0x3eeca887, 0x3ac1f452), TOBN(0xb417fce9, 0xfb6aeffe), + TOBN(0xa5918046, 0xeefb8dc3), TOBN(0x73d318ac, 0x02209400)}}, + {{TOBN(0xe800400f, 0x728693e5), TOBN(0xe87d814b, 0x339927ed), + TOBN(0x93e94d3b, 0x57ea9910), TOBN(0xff8a35b6, 0x2245fb69)}, + {TOBN(0x043853d7, 0x7f200d34), TOBN(0x470f1e68, 0x0f653ce1), + TOBN(0x81ac05bd, 0x59a06379), TOBN(0xa14052c2, 0x03930c29)}}, + {{TOBN(0x6b72fab5, 0x26bc2797), TOBN(0x13670d16, 0x99f16771), + 
TOBN(0x00170052, 0x1e3e48d1), TOBN(0x978fe401, 0xb7adf678)}, + {TOBN(0x55ecfb92, 0xd41c5dd4), TOBN(0x5ff8e247, 0xc7b27da5), + TOBN(0xe7518272, 0x013fb606), TOBN(0x5768d7e5, 0x2f547a3c)}}, + {{TOBN(0xbb24eaa3, 0x60017a5f), TOBN(0x6b18e6e4, 0x9c64ce9b), + TOBN(0xc225c655, 0x103dde07), TOBN(0xfc3672ae, 0x7592f7ea)}, + {TOBN(0x9606ad77, 0xd06283a1), TOBN(0x542fc650, 0xe4d59d99), + TOBN(0xabb57c49, 0x2a40e7c2), TOBN(0xac948f13, 0xa8db9f55)}}, + {{TOBN(0x6d4c9682, 0xb04465c3), TOBN(0xe3d062fa, 0x6468bd15), + TOBN(0xa51729ac, 0x5f318d7e), TOBN(0x1fc87df6, 0x9eb6fc95)}, + {TOBN(0x63d146a8, 0x0591f652), TOBN(0xa861b8f7, 0x589621aa), + TOBN(0x59f5f15a, 0xce31348c), TOBN(0x8f663391, 0x440da6da)}}, + {{TOBN(0xcfa778ac, 0xb591ffa3), TOBN(0x027ca9c5, 0x4cdfebce), + TOBN(0xbe8e05a5, 0x444ea6b3), TOBN(0x8aab4e69, 0xa78d8254)}, + {TOBN(0x2437f04f, 0xb474d6b8), TOBN(0x6597ffd4, 0x045b3855), + TOBN(0xbb0aea4e, 0xca47ecaa), TOBN(0x568aae83, 0x85c7ebfc)}}, + {{TOBN(0x0e966e64, 0xc73b2383), TOBN(0x49eb3447, 0xd17d8762), + TOBN(0xde107821, 0x8da05dab), TOBN(0x443d8baa, 0x016b7236)}, + {TOBN(0x163b63a5, 0xea7610d6), TOBN(0xe47e4185, 0xce1ca979), + TOBN(0xae648b65, 0x80baa132), TOBN(0xebf53de2, 0x0e0d5b64)}}, + {{TOBN(0x8d3bfcb4, 0xd3c8c1ca), TOBN(0x0d914ef3, 0x5d04b309), + TOBN(0x55ef6415, 0x3de7d395), TOBN(0xbde1666f, 0x26b850e8)}, + {TOBN(0xdbe1ca6e, 0xd449ab19), TOBN(0x8902b322, 0xe89a2672), + TOBN(0xb1674b7e, 0xdacb7a53), TOBN(0x8e9faf6e, 0xf52523ff)}}, + {{TOBN(0x6ba535da, 0x9a85788b), TOBN(0xd21f03ae, 0xbd0626d4), + TOBN(0x099f8c47, 0xe873dc64), TOBN(0xcda8564d, 0x018ec97e)}, + {TOBN(0x3e8d7a5c, 0xde92c68c), TOBN(0x78e035a1, 0x73323cc4), + TOBN(0x3ef26275, 0xf880ff7c), TOBN(0xa4ee3dff, 0x273eedaa)}}, + {{TOBN(0x58823507, 0xaf4e18f8), TOBN(0x967ec9b5, 0x0672f328), + TOBN(0x9ded19d9, 0x559d3186), TOBN(0x5e2ab3de, 0x6cdce39c)}, + {TOBN(0xabad6e4d, 0x11c226df), TOBN(0xf9783f43, 0x87723014), + TOBN(0x9a49a0cf, 0x1a885719), TOBN(0xfc0c1a5a, 0x90da9dbf)}}, + {{TOBN(0x8bbaec49, 0x571d92ac), TOBN(0x569e85fe, 0x4692517f), + TOBN(0x8333b014, 0xa14ea4af), TOBN(0x32f2a62f, 0x12e5c5ad)}, + {TOBN(0x98c2ce3a, 0x06d89b85), TOBN(0xb90741aa, 0x2ff77a08), + TOBN(0x2530defc, 0x01f795a2), TOBN(0xd6e5ba0b, 0x84b3c199)}}, + {{TOBN(0x7d8e8451, 0x12e4c936), TOBN(0xae419f7d, 0xbd0be17b), + TOBN(0xa583fc8c, 0x22262bc9), TOBN(0x6b842ac7, 0x91bfe2bd)}, + {TOBN(0x33cef4e9, 0x440d6827), TOBN(0x5f69f4de, 0xef81fb14), + TOBN(0xf16cf6f6, 0x234fbb92), TOBN(0x76ae3fc3, 0xd9e7e158)}}, + {{TOBN(0x4e89f6c2, 0xe9740b33), TOBN(0x677bc85d, 0x4962d6a1), + TOBN(0x6c6d8a7f, 0x68d10d15), TOBN(0x5f9a7224, 0x0257b1cd)}, + {TOBN(0x7096b916, 0x4ad85961), TOBN(0x5f8c47f7, 0xe657ab4a), + TOBN(0xde57d7d0, 0xf7461d7e), TOBN(0x7eb6094d, 0x80ce5ee2)}}, + {{TOBN(0x0b1e1dfd, 0x34190547), TOBN(0x8a394f43, 0xf05dd150), + TOBN(0x0a9eb24d, 0x97df44e6), TOBN(0x78ca06bf, 0x87675719)}, + {TOBN(0x6f0b3462, 0x6ffeec22), TOBN(0x9d91bcea, 0x36cdd8fb), + TOBN(0xac83363c, 0xa105be47), TOBN(0x81ba76c1, 0x069710e3)}}, + {{TOBN(0x3d1b24cb, 0x28c682c6), TOBN(0x27f25228, 0x8612575b), + TOBN(0xb587c779, 0xe8e66e98), TOBN(0x7b0c03e9, 0x405eb1fe)}, + {TOBN(0xfdf0d030, 0x15b548e7), TOBN(0xa8be76e0, 0x38b36af7), + TOBN(0x4cdab04a, 0x4f310c40), TOBN(0x6287223e, 0xf47ecaec)}}, + {{TOBN(0x678e6055, 0x8b399320), TOBN(0x61fe3fa6, 0xc01e4646), + TOBN(0xc482866b, 0x03261a5e), TOBN(0xdfcf45b8, 0x5c2f244a)}, + {TOBN(0x8fab9a51, 0x2f684b43), TOBN(0xf796c654, 0xc7220a66), + TOBN(0x1d90707e, 0xf5afa58f), TOBN(0x2c421d97, 0x4fdbe0de)}}, + {{TOBN(0xc4f4cda3, 0xaf2ebc2f), TOBN(0xa0af843d, 0xcb4efe24), 
+ TOBN(0x53b857c1, 0x9ccd10b1), TOBN(0xddc9d1eb, 0x914d3e04)}, + {TOBN(0x7bdec8bb, 0x62771deb), TOBN(0x829277aa, 0x91c5aa81), + TOBN(0x7af18dd6, 0x832391ae), TOBN(0x1740f316, 0xc71a84ca)}}}, + {{{TOBN(0x8928e99a, 0xeeaf8c49), TOBN(0xee7aa73d, 0x6e24d728), + TOBN(0x4c5007c2, 0xe72b156c), TOBN(0x5fcf57c5, 0xed408a1d)}, + {TOBN(0x9f719e39, 0xb6057604), TOBN(0x7d343c01, 0xc2868bbf), + TOBN(0x2cca254b, 0x7e103e2d), TOBN(0xe6eb38a9, 0xf131bea2)}}, + {{TOBN(0xb33e624f, 0x8be762b4), TOBN(0x2a9ee4d1, 0x058e3413), + TOBN(0x968e6369, 0x67d805fa), TOBN(0x9848949b, 0x7db8bfd7)}, + {TOBN(0x5308d7e5, 0xd23a8417), TOBN(0x892f3b1d, 0xf3e29da5), + TOBN(0xc95c139e, 0x3dee471f), TOBN(0x8631594d, 0xd757e089)}}, + {{TOBN(0xe0c82a3c, 0xde918dcc), TOBN(0x2e7b5994, 0x26fdcf4b), + TOBN(0x82c50249, 0x32cb1b2d), TOBN(0xea613a9d, 0x7657ae07)}, + {TOBN(0xc2eb5f6c, 0xf1fdc9f7), TOBN(0xb6eae8b8, 0x879fe682), + TOBN(0x253dfee0, 0x591cbc7f), TOBN(0x000da713, 0x3e1290e6)}}, + {{TOBN(0x1083e2ea, 0x1f095615), TOBN(0x0a28ad77, 0x14e68c33), + TOBN(0x6bfc0252, 0x3d8818be), TOBN(0xb585113a, 0xf35850cd)}, + {TOBN(0x7d935f0b, 0x30df8aa1), TOBN(0xaddda07c, 0x4ab7e3ac), + TOBN(0x92c34299, 0x552f00cb), TOBN(0xc33ed1de, 0x2909df6c)}}, + {{TOBN(0x22c2195d, 0x80e87766), TOBN(0x9e99e6d8, 0x9ddf4ac0), + TOBN(0x09642e4e, 0x65e74934), TOBN(0x2610ffa2, 0xff1ff241)}, + {TOBN(0x4d1d47d4, 0x751c8159), TOBN(0x697b4985, 0xaf3a9363), + TOBN(0x0318ca46, 0x87477c33), TOBN(0xa90cb565, 0x9441eff3)}}, + {{TOBN(0x58bb3848, 0x36f024cb), TOBN(0x85be1f77, 0x36016168), + TOBN(0x6c59587c, 0xdc7e07f1), TOBN(0x191be071, 0xaf1d8f02)}, + {TOBN(0xbf169fa5, 0xcca5e55c), TOBN(0x3864ba3c, 0xf7d04eac), + TOBN(0x915e367f, 0x8d7d05db), TOBN(0xb48a876d, 0xa6549e5d)}}, + {{TOBN(0xef89c656, 0x580e40a2), TOBN(0xf194ed8c, 0x728068bc), + TOBN(0x74528045, 0xa47990c9), TOBN(0xf53fc7d7, 0x5e1a4649)}, + {TOBN(0xbec5ae9b, 0x78593e7d), TOBN(0x2cac4ee3, 0x41db65d7), + TOBN(0xa8c1eb24, 0x04a3d39b), TOBN(0x53b7d634, 0x03f8f3ef)}}, + {{TOBN(0x2dc40d48, 0x3e07113c), TOBN(0x6e4a5d39, 0x7d8b63ae), + TOBN(0x5582a94b, 0x79684c2b), TOBN(0x932b33d4, 0x622da26c)}, + {TOBN(0xf534f651, 0x0dbbf08d), TOBN(0x211d07c9, 0x64c23a52), + TOBN(0x0eeece0f, 0xee5bdc9b), TOBN(0xdf178168, 0xf7015558)}}, + {{TOBN(0xd4294635, 0x0a712229), TOBN(0x93cbe448, 0x09273f8c), + TOBN(0x00b095ef, 0x8f13bc83), TOBN(0xbb741972, 0x8798978c)}, + {TOBN(0x9d7309a2, 0x56dbe6e7), TOBN(0xe578ec56, 0x5a5d39ec), + TOBN(0x3961151b, 0x851f9a31), TOBN(0x2da7715d, 0xe5709eb4)}}, + {{TOBN(0x867f3017, 0x53dfabf0), TOBN(0x728d2078, 0xb8e39259), + TOBN(0x5c75a0cd, 0x815d9958), TOBN(0xf84867a6, 0x16603be1)}, + {TOBN(0xc865b13d, 0x70e35b1c), TOBN(0x02414468, 0x19b03e2c), + TOBN(0xe46041da, 0xac1f3121), TOBN(0x7c9017ad, 0x6f028a7c)}}, + {{TOBN(0xabc96de9, 0x0a482873), TOBN(0x4265d6b1, 0xb77e54d4), + TOBN(0x68c38e79, 0xa57d88e7), TOBN(0xd461d766, 0x9ce82de3)}, + {TOBN(0x817a9ec5, 0x64a7e489), TOBN(0xcc5675cd, 0xa0def5f2), + TOBN(0x9a00e785, 0x985d494e), TOBN(0xc626833f, 0x1b03514a)}}, + {{TOBN(0xabe7905a, 0x83cdd60e), TOBN(0x50602fb5, 0xa1170184), + TOBN(0x689886cd, 0xb023642a), TOBN(0xd568d090, 0xa6e1fb00)}, + {TOBN(0x5b1922c7, 0x0259217f), TOBN(0x93831cd9, 0xc43141e4), + TOBN(0xdfca3587, 0x0c95f86e), TOBN(0xdec2057a, 0x568ae828)}}, + {{TOBN(0xc44ea599, 0xf98a759a), TOBN(0x55a0a7a2, 0xf7c23c1d), + TOBN(0xd5ffb6e6, 0x94c4f687), TOBN(0x3563cce2, 0x12848478)}, + {TOBN(0x812b3517, 0xe7b1fbe1), TOBN(0x8a7dc979, 0x4f7338e0), + TOBN(0x211ecee9, 0x52d048db), TOBN(0x2eea4056, 0xc86ea3b8)}}, + {{TOBN(0xd8cb68a7, 0xba772b34), TOBN(0xe16ed341, 
0x5f4e2541), + TOBN(0x9b32f6a6, 0x0fec14db), TOBN(0xeee376f7, 0x391698be)}, + {TOBN(0xe9a7aa17, 0x83674c02), TOBN(0x65832f97, 0x5843022a), + TOBN(0x29f3a8da, 0x5ba4990f), TOBN(0x79a59c3a, 0xfb8e3216)}}, + {{TOBN(0x9cdc4d2e, 0xbd19bb16), TOBN(0xc6c7cfd0, 0xb3262d86), + TOBN(0xd4ce14d0, 0x969c0b47), TOBN(0x1fa352b7, 0x13e56128)}, + {TOBN(0x383d55b8, 0x973db6d3), TOBN(0x71836850, 0xe8e5b7bf), + TOBN(0xc7714596, 0xe6bb571f), TOBN(0x259df31f, 0x2d5b2dd2)}}, + {{TOBN(0x568f8925, 0x913cc16d), TOBN(0x18bc5b6d, 0xe1a26f5a), + TOBN(0xdfa413be, 0xf5f499ae), TOBN(0xf8835dec, 0xc3f0ae84)}, + {TOBN(0xb6e60bd8, 0x65a40ab0), TOBN(0x65596439, 0x194b377e), + TOBN(0xbcd85625, 0x92084a69), TOBN(0x5ce433b9, 0x4f23ede0)}}, + {{TOBN(0xe8e8f04f, 0x6ad65143), TOBN(0x11511827, 0xd6e14af6), + TOBN(0x3d390a10, 0x8295c0c7), TOBN(0x71e29ee4, 0x621eba16)}, + {TOBN(0xa588fc09, 0x63717b46), TOBN(0x02be02fe, 0xe06ad4a2), + TOBN(0x931558c6, 0x04c22b22), TOBN(0xbb4d4bd6, 0x12f3c849)}}, + {{TOBN(0x54a4f496, 0x20efd662), TOBN(0x92ba6d20, 0xc5952d14), + TOBN(0x2db8ea1e, 0xcc9784c2), TOBN(0x81cc10ca, 0x4b353644)}, + {TOBN(0x40b570ad, 0x4b4d7f6c), TOBN(0x5c9f1d96, 0x84a1dcd2), + TOBN(0x01379f81, 0x3147e797), TOBN(0xe5c6097b, 0x2bd499f5)}}, + {{TOBN(0x40dcafa6, 0x328e5e20), TOBN(0xf7b5244a, 0x54815550), + TOBN(0xb9a4f118, 0x47bfc978), TOBN(0x0ea0e79f, 0xd25825b1)}, + {TOBN(0xa50f96eb, 0x646c7ecf), TOBN(0xeb811493, 0x446dea9d), + TOBN(0x2af04677, 0xdfabcf69), TOBN(0xbe3a068f, 0xc713f6e8)}}, + {{TOBN(0x860d523d, 0x42e06189), TOBN(0xbf077941, 0x4e3aff13), + TOBN(0x0b616dca, 0xc1b20650), TOBN(0xe66dd6d1, 0x2131300d)}, + {TOBN(0xd4a0fd67, 0xff99abde), TOBN(0xc9903550, 0xc7aac50d), + TOBN(0x022ecf8b, 0x7c46b2d7), TOBN(0x3333b1e8, 0x3abf92af)}}, + {{TOBN(0x11cc113c, 0x6c491c14), TOBN(0x05976688, 0x80dd3f88), + TOBN(0xf5b4d9e7, 0x29d932ed), TOBN(0xe982aad8, 0xa2c38b6d)}, + {TOBN(0x6f925347, 0x8be0dcf0), TOBN(0x700080ae, 0x65ca53f2), + TOBN(0xd8131156, 0x443ca77f), TOBN(0xe92d6942, 0xec51f984)}}, + {{TOBN(0xd2a08af8, 0x85dfe9ae), TOBN(0xd825d9a5, 0x4d2a86ca), + TOBN(0x2c53988d, 0x39dff020), TOBN(0xf38b135a, 0x430cdc40)}, + {TOBN(0x0c918ae0, 0x62a7150b), TOBN(0xf31fd8de, 0x0c340e9b), + TOBN(0xafa0e7ae, 0x4dbbf02e), TOBN(0x5847fb2a, 0x5eba6239)}}, + {{TOBN(0x6b1647dc, 0xdccbac8b), TOBN(0xb642aa78, 0x06f485c8), + TOBN(0x873f3765, 0x7038ecdf), TOBN(0x2ce5e865, 0xfa49d3fe)}, + {TOBN(0xea223788, 0xc98c4400), TOBN(0x8104a8cd, 0xf1fa5279), + TOBN(0xbcf7cc7a, 0x06becfd7), TOBN(0x49424316, 0xc8f974ae)}}, + {{TOBN(0xc0da65e7, 0x84d6365d), TOBN(0xbcb7443f, 0x8f759fb8), + TOBN(0x35c712b1, 0x7ae81930), TOBN(0x80428dff, 0x4c6e08ab)}, + {TOBN(0xf19dafef, 0xa4faf843), TOBN(0xced8538d, 0xffa9855f), + TOBN(0x20ac409c, 0xbe3ac7ce), TOBN(0x358c1fb6, 0x882da71e)}}, + {{TOBN(0xafa9c0e5, 0xfd349961), TOBN(0x2b2cfa51, 0x8421c2fc), + TOBN(0x2a80db17, 0xf3a28d38), TOBN(0xa8aba539, 0x5d138e7e)}, + {TOBN(0x52012d1d, 0x6e96eb8d), TOBN(0x65d8dea0, 0xcbaf9622), + TOBN(0x57735447, 0xb264f56c), TOBN(0xbeebef3f, 0x1b6c8da2)}}, + {{TOBN(0xfc346d98, 0xce785254), TOBN(0xd50e8d72, 0xbb64a161), + TOBN(0xc03567c7, 0x49794add), TOBN(0x15a76065, 0x752c7ef6)}, + {TOBN(0x59f3a222, 0x961f23d6), TOBN(0x378e4438, 0x73ecc0b0), + TOBN(0xc74be434, 0x5a82fde4), TOBN(0xae509af2, 0xd8b9cf34)}}, + {{TOBN(0x4a61ee46, 0x577f44a1), TOBN(0xe09b748c, 0xb611deeb), + TOBN(0xc0481b2c, 0xf5f7b884), TOBN(0x35626678, 0x61acfa6b)}, + {TOBN(0x37f4c518, 0xbf8d21e6), TOBN(0x22d96531, 0xb205a76d), + TOBN(0x37fb85e1, 0x954073c0), TOBN(0xbceafe4f, 0x65b3a567)}}, + {{TOBN(0xefecdef7, 0xbe42a582), 
TOBN(0xd3fc6080, 0x65046be6), + TOBN(0xc9af13c8, 0x09e8dba9), TOBN(0x1e6c9847, 0x641491ff)}, + {TOBN(0x3b574925, 0xd30c31f7), TOBN(0xb7eb72ba, 0xac2a2122), + TOBN(0x776a0dac, 0xef0859e7), TOBN(0x06fec314, 0x21900942)}}, + {{TOBN(0x2464bc10, 0xf8c22049), TOBN(0x9bfbcce7, 0x875ebf69), + TOBN(0xd7a88e2a, 0x4336326b), TOBN(0xda05261c, 0x5bc2acfa)}, + {TOBN(0xc29f5bdc, 0xeba7efc8), TOBN(0x471237ca, 0x25dbbf2e), + TOBN(0xa72773f2, 0x2975f127), TOBN(0xdc744e8e, 0x04d0b326)}}, + {{TOBN(0x38a7ed16, 0xa56edb73), TOBN(0x64357e37, 0x2c007e70), + TOBN(0xa167d15b, 0x5080b400), TOBN(0x07b41164, 0x23de4be1)}, + {TOBN(0xb2d91e32, 0x74c89883), TOBN(0x3c162821, 0x2882e7ed), + TOBN(0xad6b36ba, 0x7503e482), TOBN(0x48434e8e, 0x0ea34331)}}, + {{TOBN(0x79f4f24f, 0x2c7ae0b9), TOBN(0xc46fbf81, 0x1939b44a), + TOBN(0x76fefae8, 0x56595eb1), TOBN(0x417b66ab, 0xcd5f29c7)}, + {TOBN(0x5f2332b2, 0xc5ceec20), TOBN(0xd69661ff, 0xe1a1cae2), + TOBN(0x5ede7e52, 0x9b0286e6), TOBN(0x9d062529, 0xe276b993)}}, + {{TOBN(0x324794b0, 0x7e50122b), TOBN(0xdd744f8b, 0x4af07ca5), + TOBN(0x30a12f08, 0xd63fc97b), TOBN(0x39650f1a, 0x76626d9d)}, + {TOBN(0x101b47f7, 0x1fa38477), TOBN(0x3d815f19, 0xd4dc124f), + TOBN(0x1569ae95, 0xb26eb58a), TOBN(0xc3cde188, 0x95fb1887)}}, + {{TOBN(0x54e9f37b, 0xf9539a48), TOBN(0xb0100e06, 0x7408c1a5), + TOBN(0x821d9811, 0xea580cbb), TOBN(0x8af52d35, 0x86e50c56)}, + {TOBN(0xdfbd9d47, 0xdbbf698b), TOBN(0x2961a1ea, 0x03dc1c73), + TOBN(0x203d38f8, 0xe76a5df8), TOBN(0x08a53a68, 0x6def707a)}}, + {{TOBN(0x26eefb48, 0x1bee45d4), TOBN(0xb3cee346, 0x3c688036), + TOBN(0x463c5315, 0xc42f2469), TOBN(0x19d84d2e, 0x81378162)}, + {TOBN(0x22d7c3c5, 0x1c4d349f), TOBN(0x65965844, 0x163d59c5), + TOBN(0xcf198c56, 0xb8abceae), TOBN(0x6fb1fb1b, 0x628559d5)}}, + {{TOBN(0x8bbffd06, 0x07bf8fe3), TOBN(0x46259c58, 0x3467734b), + TOBN(0xd8953cea, 0x35f7f0d3), TOBN(0x1f0bece2, 0xd65b0ff1)}, + {TOBN(0xf7d5b4b3, 0xf3c72914), TOBN(0x29e8ea95, 0x3cb53389), + TOBN(0x4a365626, 0x836b6d46), TOBN(0xe849f910, 0xea174fde)}}, + {{TOBN(0x7ec62fbb, 0xf4737f21), TOBN(0xd8dba5ab, 0x6209f5ac), + TOBN(0x24b5d7a9, 0xa5f9adbe), TOBN(0x707d28f7, 0xa61dc768)}, + {TOBN(0x7711460b, 0xcaa999ea), TOBN(0xba7b174d, 0x1c92e4cc), + TOBN(0x3c4bab66, 0x18d4bf2d), TOBN(0xb8f0c980, 0xeb8bd279)}}, + {{TOBN(0x024bea9a, 0x324b4737), TOBN(0xfba9e423, 0x32a83bca), + TOBN(0x6e635643, 0xa232dced), TOBN(0x99619367, 0x2571c8ba)}, + {TOBN(0xe8c9f357, 0x54b7032b), TOBN(0xf936b3ba, 0x2442d54a), + TOBN(0x2263f0f0, 0x8290c65a), TOBN(0x48989780, 0xee2c7fdb)}}, + {{TOBN(0xadc5d55a, 0x13d4f95e), TOBN(0x737cff85, 0xad9b8500), + TOBN(0x271c557b, 0x8a73f43d), TOBN(0xbed617a4, 0xe18bc476)}, + {TOBN(0x66245401, 0x7dfd8ab2), TOBN(0xae7b89ae, 0x3a2870aa), + TOBN(0x1b555f53, 0x23a7e545), TOBN(0x6791e247, 0xbe057e4c)}}, + {{TOBN(0x860136ad, 0x324fa34d), TOBN(0xea111447, 0x4cbeae28), + TOBN(0x023a4270, 0xbedd3299), TOBN(0x3d5c3a7f, 0xc1c35c34)}, + {TOBN(0xb0f6db67, 0x8d0412d2), TOBN(0xd92625e2, 0xfcdc6b9a), + TOBN(0x92ae5ccc, 0x4e28a982), TOBN(0xea251c36, 0x47a3ce7e)}}, + {{TOBN(0x9d658932, 0x790691bf), TOBN(0xed610589, 0x06b736ae), + TOBN(0x712c2f04, 0xc0d63b6e), TOBN(0x5cf06fd5, 0xc63d488f)}, + {TOBN(0x97363fac, 0xd9588e41), TOBN(0x1f9bf762, 0x2b93257e), + TOBN(0xa9d1ffc4, 0x667acace), TOBN(0x1cf4a1aa, 0x0a061ecf)}}, + {{TOBN(0x40e48a49, 0xdc1818d0), TOBN(0x0643ff39, 0xa3621ab0), + TOBN(0x5768640c, 0xe39ef639), TOBN(0x1fc099ea, 0x04d86854)}, + {TOBN(0x9130b9c3, 0xeccd28fd), TOBN(0xd743cbd2, 0x7eec54ab), + TOBN(0x052b146f, 0xe5b475b6), TOBN(0x058d9a82, 0x900a7d1f)}}, + {{TOBN(0x65e02292, 
0x91262b72), TOBN(0x96f924f9, 0xbb0edf03), + TOBN(0x5cfa59c8, 0xfe206842), TOBN(0xf6037004, 0x5eafa720)}, + {TOBN(0x5f30699e, 0x18d7dd96), TOBN(0x381e8782, 0xcbab2495), + TOBN(0x91669b46, 0xdd8be949), TOBN(0xb40606f5, 0x26aae8ef)}}, + {{TOBN(0x2812b839, 0xfc6751a4), TOBN(0x16196214, 0xfba800ef), + TOBN(0x4398d5ca, 0x4c1a2875), TOBN(0x720c00ee, 0x653d8349)}, + {TOBN(0xc2699eb0, 0xd820007c), TOBN(0x880ee660, 0xa39b5825), + TOBN(0x70694694, 0x471f6984), TOBN(0xf7d16ea8, 0xe3dda99a)}}, + {{TOBN(0x28d675b2, 0xc0519a23), TOBN(0x9ebf94fe, 0x4f6952e3), + TOBN(0xf28bb767, 0xa2294a8a), TOBN(0x85512b4d, 0xfe0af3f5)}, + {TOBN(0x18958ba8, 0x99b16a0d), TOBN(0x95c2430c, 0xba7548a7), + TOBN(0xb30d1b10, 0xa16be615), TOBN(0xe3ebbb97, 0x85bfb74c)}}, + {{TOBN(0xa3273cfe, 0x18549fdb), TOBN(0xf6e200bf, 0x4fcdb792), + TOBN(0x54a76e18, 0x83aba56c), TOBN(0x73ec66f6, 0x89ef6aa2)}, + {TOBN(0x8d17add7, 0xd1b9a305), TOBN(0xa959c5b9, 0xb7ae1b9d), + TOBN(0x88643522, 0x6bcc094a), TOBN(0xcc5616c4, 0xd7d429b9)}}, + {{TOBN(0xa6dada01, 0xe6a33f7c), TOBN(0xc6217a07, 0x9d4e70ad), + TOBN(0xd619a818, 0x09c15b7c), TOBN(0xea06b329, 0x0e80c854)}, + {TOBN(0x174811ce, 0xa5f5e7b9), TOBN(0x66dfc310, 0x787c65f4), + TOBN(0x4ea7bd69, 0x3316ab54), TOBN(0xc12c4acb, 0x1dcc0f70)}}, + {{TOBN(0xe4308d1a, 0x1e407dd9), TOBN(0xe8a3587c, 0x91afa997), + TOBN(0xea296c12, 0xab77b7a5), TOBN(0xb5ad49e4, 0x673c0d52)}, + {TOBN(0x40f9b2b2, 0x7006085a), TOBN(0xa88ff340, 0x87bf6ec2), + TOBN(0x978603b1, 0x4e3066a6), TOBN(0xb3f99fc2, 0xb5e486e2)}}, + {{TOBN(0x07b53f5e, 0xb2e63645), TOBN(0xbe57e547, 0x84c84232), + TOBN(0xd779c216, 0x7214d5cf), TOBN(0x617969cd, 0x029a3aca)}, + {TOBN(0xd17668cd, 0x8a7017a0), TOBN(0x77b4d19a, 0xbe9b7ee8), + TOBN(0x58fd0e93, 0x9c161776), TOBN(0xa8c4f4ef, 0xd5968a72)}}, + {{TOBN(0x296071cc, 0x67b3de77), TOBN(0xae3c0b8e, 0x634f7905), + TOBN(0x67e440c2, 0x8a7100c9), TOBN(0xbb8c3c1b, 0xeb4b9b42)}, + {TOBN(0x6d71e8ea, 0xc51b3583), TOBN(0x7591f5af, 0x9525e642), + TOBN(0xf73a2f7b, 0x13f509f3), TOBN(0x618487aa, 0x5619ac9b)}}, + {{TOBN(0x3a72e5f7, 0x9d61718a), TOBN(0x00413bcc, 0x7592d28c), + TOBN(0x7d9b11d3, 0x963c35cf), TOBN(0x77623bcf, 0xb90a46ed)}, + {TOBN(0xdeef273b, 0xdcdd2a50), TOBN(0x4a741f9b, 0x0601846e), + TOBN(0x33b89e51, 0x0ec6e929), TOBN(0xcb02319f, 0x8b7f22cd)}}, + {{TOBN(0xbbe1500d, 0x084bae24), TOBN(0x2f0ae8d7, 0x343d2693), + TOBN(0xacffb5f2, 0x7cdef811), TOBN(0xaa0c030a, 0x263fb94f)}, + {TOBN(0x6eef0d61, 0xa0f442de), TOBN(0xf92e1817, 0x27b139d3), + TOBN(0x1ae6deb7, 0x0ad8bc28), TOBN(0xa89e38dc, 0xc0514130)}}, + {{TOBN(0x81eeb865, 0xd2fdca23), TOBN(0x5a15ee08, 0xcc8ef895), + TOBN(0x768fa10a, 0x01905614), TOBN(0xeff5b8ef, 0x880ee19b)}, + {TOBN(0xf0c0cabb, 0xcb1c8a0e), TOBN(0x2e1ee9cd, 0xb8c838f9), + TOBN(0x0587d8b8, 0x8a4a14c0), TOBN(0xf6f27896, 0x2ff698e5)}}, + {{TOBN(0xed38ef1c, 0x89ee6256), TOBN(0xf44ee1fe, 0x6b353b45), + TOBN(0x9115c0c7, 0x70e903b3), TOBN(0xc78ec0a1, 0x818f31df)}, + {TOBN(0x6c003324, 0xb7dccbc6), TOBN(0xd96dd1f3, 0x163bbc25), + TOBN(0x33aa82dd, 0x5cedd805), TOBN(0x123aae4f, 0x7f7eb2f1)}}, + {{TOBN(0x1723fcf5, 0xa26262cd), TOBN(0x1f7f4d5d, 0x0060ebd5), + TOBN(0xf19c5c01, 0xb2eaa3af), TOBN(0x2ccb9b14, 0x9790accf)}, + {TOBN(0x1f9c1cad, 0x52324aa6), TOBN(0x63200526, 0x7247df54), + TOBN(0x5732fe42, 0xbac96f82), TOBN(0x52fe771f, 0x01a1c384)}}, + {{TOBN(0x546ca13d, 0xb1001684), TOBN(0xb56b4eee, 0xa1709f75), + TOBN(0x266545a9, 0xd5db8672), TOBN(0xed971c90, 0x1e8f3cfb)}, + {TOBN(0x4e7d8691, 0xe3a07b29), TOBN(0x7570d9ec, 0xe4b696b9), + TOBN(0xdc5fa067, 0x7bc7e9ae), TOBN(0x68b44caf, 0xc82c4844)}}, + 
{{TOBN(0x519d34b3, 0xbf44da80), TOBN(0x283834f9, 0x5ab32e66), + TOBN(0x6e608797, 0x6278a000), TOBN(0x1e62960e, 0x627312f6)}, + {TOBN(0x9b87b27b, 0xe6901c55), TOBN(0x80e78538, 0x24fdbc1f), + TOBN(0xbbbc0951, 0x2facc27d), TOBN(0x06394239, 0xac143b5a)}}, + {{TOBN(0x35bb4a40, 0x376c1944), TOBN(0x7cb62694, 0x63da1511), + TOBN(0xafd29161, 0xb7148a3b), TOBN(0xa6f9d9ed, 0x4e2ea2ee)}, + {TOBN(0x15dc2ca2, 0x880dd212), TOBN(0x903c3813, 0xa61139a9), + TOBN(0x2aa7b46d, 0x6c0f8785), TOBN(0x36ce2871, 0x901c60ff)}}, + {{TOBN(0xc683b028, 0xe10d9c12), TOBN(0x7573baa2, 0x032f33d3), + TOBN(0x87a9b1f6, 0x67a31b58), TOBN(0xfd3ed11a, 0xf4ffae12)}, + {TOBN(0x83dcaa9a, 0x0cb2748e), TOBN(0x8239f018, 0x5d6fdf16), + TOBN(0xba67b49c, 0x72753941), TOBN(0x2beec455, 0xc321cb36)}}, + {{TOBN(0x88015606, 0x3f8b84ce), TOBN(0x76417083, 0x8d38c86f), + TOBN(0x054f1ca7, 0x598953dd), TOBN(0xc939e110, 0x4e8e7429)}, + {TOBN(0x9b1ac2b3, 0x5a914f2f), TOBN(0x39e35ed3, 0xe74b8f9c), + TOBN(0xd0debdb2, 0x781b2fb0), TOBN(0x1585638f, 0x2d997ba2)}}, + {{TOBN(0x9c4b646e, 0x9e2fce99), TOBN(0x68a21081, 0x1e80857f), + TOBN(0x06d54e44, 0x3643b52a), TOBN(0xde8d6d63, 0x0d8eb843)}, + {TOBN(0x70321563, 0x42146a0a), TOBN(0x8ba826f2, 0x5eaa3622), + TOBN(0x227a58bd, 0x86138787), TOBN(0x43b6c03c, 0x10281d37)}}, + {{TOBN(0x6326afbb, 0xb54dde39), TOBN(0x744e5e8a, 0xdb6f2d5f), + TOBN(0x48b2a99a, 0xcff158e1), TOBN(0xa93c8fa0, 0xef87918f)}, + {TOBN(0x2182f956, 0xde058c5c), TOBN(0x216235d2, 0x936f9e7a), + TOBN(0xace0c0db, 0xd2e31e67), TOBN(0xc96449bf, 0xf23ac3e7)}}, + {{TOBN(0x7e9a2874, 0x170693bd), TOBN(0xa28e14fd, 0xa45e6335), + TOBN(0x5757f6b3, 0x56427344), TOBN(0x822e4556, 0xacf8edf9)}, + {TOBN(0x2b7a6ee2, 0xe6a285cd), TOBN(0x5866f211, 0xa9df3af0), + TOBN(0x40dde2dd, 0xf845b844), TOBN(0x986c3726, 0x110e5e49)}}, + {{TOBN(0x73680c2a, 0xf7172277), TOBN(0x57b94f0f, 0x0cccb244), + TOBN(0xbdff7267, 0x2d438ca7), TOBN(0xbad1ce11, 0xcf4663fd)}, + {TOBN(0x9813ed9d, 0xd8f71cae), TOBN(0xf43272a6, 0x961fdaa6), + TOBN(0xbeff0119, 0xbd6d1637), TOBN(0xfebc4f91, 0x30361978)}}, + {{TOBN(0x02b37a95, 0x2f41deff), TOBN(0x0e44a59a, 0xe63b89b7), + TOBN(0x673257dc, 0x143ff951), TOBN(0x19c02205, 0xd752baf4)}, + {TOBN(0x46c23069, 0xc4b7d692), TOBN(0x2e6392c3, 0xfd1502ac), + TOBN(0x6057b1a2, 0x1b220846), TOBN(0xe51ff946, 0x0c1b5b63)}}}, + {{{TOBN(0x6e85cb51, 0x566c5c43), TOBN(0xcff9c919, 0x3597f046), + TOBN(0x9354e90c, 0x4994d94a), TOBN(0xe0a39332, 0x2147927d)}, + {TOBN(0x8427fac1, 0x0dc1eb2b), TOBN(0x88cfd8c2, 0x2ff319fa), + TOBN(0xe2d4e684, 0x01965274), TOBN(0xfa2e067d, 0x67aaa746)}}, + {{TOBN(0xb6d92a7f, 0x3e5f9f11), TOBN(0x9afe153a, 0xd6cb3b8e), + TOBN(0x4d1a6dd7, 0xddf800bd), TOBN(0xf6c13cc0, 0xcaf17e19)}, + {TOBN(0x15f6c58e, 0x325fc3ee), TOBN(0x71095400, 0xa31dc3b2), + TOBN(0x168e7c07, 0xafa3d3e7), TOBN(0x3f8417a1, 0x94c7ae2d)}}, + {{TOBN(0xec234772, 0x813b230d), TOBN(0x634d0f5f, 0x17344427), + TOBN(0x11548ab1, 0xd77fc56a), TOBN(0x7fab1750, 0xce06af77)}, + {TOBN(0xb62c10a7, 0x4f7c4f83), TOBN(0xa7d2edc4, 0x220a67d9), + TOBN(0x1c404170, 0x921209a0), TOBN(0x0b9815a0, 0xface59f0)}}, + {{TOBN(0x2842589b, 0x319540c3), TOBN(0x18490f59, 0xa283d6f8), + TOBN(0xa2731f84, 0xdaae9fcb), TOBN(0x3db6d960, 0xc3683ba0)}, + {TOBN(0xc85c63bb, 0x14611069), TOBN(0xb19436af, 0x0788bf05), + TOBN(0x905459df, 0x347460d2), TOBN(0x73f6e094, 0xe11a7db1)}}, + {{TOBN(0xdc7f938e, 0xb6357f37), TOBN(0xc5d00f79, 0x2bd8aa62), + TOBN(0xc878dcb9, 0x2ca979fc), TOBN(0x37e83ed9, 0xeb023a99)}, + {TOBN(0x6b23e273, 0x1560bf3d), TOBN(0x1086e459, 0x1d0fae61), + TOBN(0x78248316, 0x9a9414bd), TOBN(0x1b956bc0, 
0xf0ea9ea1)}}, + {{TOBN(0x7b85bb91, 0xc31b9c38), TOBN(0x0c5aa90b, 0x48ef57b5), + TOBN(0xdedeb169, 0xaf3bab6f), TOBN(0xe610ad73, 0x2d373685)}, + {TOBN(0xf13870df, 0x02ba8e15), TOBN(0x0337edb6, 0x8ca7f771), + TOBN(0xe4acf747, 0xb62c036c), TOBN(0xd921d576, 0xb6b94e81)}}, + {{TOBN(0xdbc86439, 0x2c422f7a), TOBN(0xfb635362, 0xed348898), + TOBN(0x83084668, 0xc45bfcd1), TOBN(0xc357c9e3, 0x2b315e11)}, + {TOBN(0xb173b540, 0x5b2e5b8c), TOBN(0x7e946931, 0xe102b9a4), + TOBN(0x17c890eb, 0x7b0fb199), TOBN(0xec225a83, 0xd61b662b)}}, + {{TOBN(0xf306a3c8, 0xee3c76cb), TOBN(0x3cf11623, 0xd32a1f6e), + TOBN(0xe6d5ab64, 0x6863e956), TOBN(0x3b8a4cbe, 0x5c005c26)}, + {TOBN(0xdcd529a5, 0x9ce6bb27), TOBN(0xc4afaa52, 0x04d4b16f), + TOBN(0xb0624a26, 0x7923798d), TOBN(0x85e56df6, 0x6b307fab)}}, + {{TOBN(0x0281893c, 0x2bf29698), TOBN(0x91fc19a4, 0xd7ce7603), + TOBN(0x75a5dca3, 0xad9a558f), TOBN(0x40ceb3fa, 0x4d50bf77)}, + {TOBN(0x1baf6060, 0xbc9ba369), TOBN(0x927e1037, 0x597888c2), + TOBN(0xd936bf19, 0x86a34c07), TOBN(0xd4cf10c1, 0xc34ae980)}}, + {{TOBN(0x3a3e5334, 0x859dd614), TOBN(0x9c475b5b, 0x18d0c8ee), + TOBN(0x63080d1f, 0x07cd51d5), TOBN(0xc9c0d0a6, 0xb88b4326)}, + {TOBN(0x1ac98691, 0xc234296f), TOBN(0x2a0a83a4, 0x94887fb6), + TOBN(0x56511427, 0x0cea9cf2), TOBN(0x5230a6e8, 0xa24802f5)}}, + {{TOBN(0xf7a2bf0f, 0x72e3d5c1), TOBN(0x37717446, 0x4f21439e), + TOBN(0xfedcbf25, 0x9ce30334), TOBN(0xe0030a78, 0x7ce202f9)}, + {TOBN(0x6f2d9ebf, 0x1202e9ca), TOBN(0xe79dde6c, 0x75e6e591), + TOBN(0xf52072af, 0xf1dac4f8), TOBN(0x6c8d087e, 0xbb9b404d)}}, + {{TOBN(0xad0fc73d, 0xbce913af), TOBN(0x909e587b, 0x458a07cb), + TOBN(0x1300da84, 0xd4f00c8a), TOBN(0x425cd048, 0xb54466ac)}, + {TOBN(0xb59cb9be, 0x90e9d8bf), TOBN(0x991616db, 0x3e431b0e), + TOBN(0xd3aa117a, 0x531aecff), TOBN(0x91af92d3, 0x59f4dc3b)}}, + {{TOBN(0x9b1ec292, 0xe93fda29), TOBN(0x76bb6c17, 0xe97d91bc), + TOBN(0x7509d95f, 0xaface1e6), TOBN(0x3653fe47, 0xbe855ae3)}, + {TOBN(0x73180b28, 0x0f680e75), TOBN(0x75eefd1b, 0xeeb6c26c), + TOBN(0xa4cdf29f, 0xb66d4236), TOBN(0x2d70a997, 0x6b5821d8)}}, + {{TOBN(0x7a3ee207, 0x20445c36), TOBN(0x71d1ac82, 0x59877174), + TOBN(0x0fc539f7, 0x949f73e9), TOBN(0xd05cf3d7, 0x982e3081)}, + {TOBN(0x8758e20b, 0x7b1c7129), TOBN(0xffadcc20, 0x569e61f2), + TOBN(0xb05d3a2f, 0x59544c2d), TOBN(0xbe16f5c1, 0x9fff5e53)}}, + {{TOBN(0x73cf65b8, 0xaad58135), TOBN(0x622c2119, 0x037aa5be), + TOBN(0x79373b3f, 0x646fd6a0), TOBN(0x0e029db5, 0x0d3978cf)}, + {TOBN(0x8bdfc437, 0x94fba037), TOBN(0xaefbd687, 0x620797a6), + TOBN(0x3fa5382b, 0xbd30d38e), TOBN(0x7627cfbf, 0x585d7464)}}, + {{TOBN(0xb2330fef, 0x4e4ca463), TOBN(0xbcef7287, 0x3566cc63), + TOBN(0xd161d2ca, 0xcf780900), TOBN(0x135dc539, 0x5b54827d)}, + {TOBN(0x638f052e, 0x27bf1bc6), TOBN(0x10a224f0, 0x07dfa06c), + TOBN(0xe973586d, 0x6d3321da), TOBN(0x8b0c5738, 0x26152c8f)}}, + {{TOBN(0x07ef4f2a, 0x34606074), TOBN(0x80fe7fe8, 0xa0f7047a), + TOBN(0x3d1a8152, 0xe1a0e306), TOBN(0x32cf43d8, 0x88da5222)}, + {TOBN(0xbf89a95f, 0x5f02ffe6), TOBN(0x3d9eb9a4, 0x806ad3ea), + TOBN(0x012c17bb, 0x79c8e55e), TOBN(0xfdcd1a74, 0x99c81dac)}}, + {{TOBN(0x7043178b, 0xb9556098), TOBN(0x4090a1df, 0x801c3886), + TOBN(0x759800ff, 0x9b67b912), TOBN(0x3e5c0304, 0x232620c8)}, + {TOBN(0x4b9d3c4b, 0x70dceeca), TOBN(0xbb2d3c15, 0x181f648e), + TOBN(0xf981d837, 0x6e33345c), TOBN(0xb626289b, 0x0cf2297a)}}, + {{TOBN(0x766ac659, 0x8baebdcf), TOBN(0x1a28ae09, 0x75df01e5), + TOBN(0xb71283da, 0x375876d8), TOBN(0x4865a96d, 0x607b9800)}, + {TOBN(0x25dd1bcd, 0x237936b2), TOBN(0x332f4f4b, 0x60417494), + TOBN(0xd0923d68, 0x370a2147), 
TOBN(0x497f5dfb, 0xdc842203)}}, + {{TOBN(0x9dc74cbd, 0x32be5e0f), TOBN(0x7475bcb7, 0x17a01375), + TOBN(0x438477c9, 0x50d872b1), TOBN(0xcec67879, 0xffe1d63d)}, + {TOBN(0x9b006014, 0xd8578c70), TOBN(0xc9ad99a8, 0x78bb6b8b), + TOBN(0x6799008e, 0x11fb3806), TOBN(0xcfe81435, 0xcd44cab3)}}, + {{TOBN(0xa2ee1582, 0x2f4fb344), TOBN(0xb8823450, 0x483fa6eb), + TOBN(0x622d323d, 0x652c7749), TOBN(0xd8474a98, 0xbeb0a15b)}, + {TOBN(0xe43c154d, 0x5d1c00d0), TOBN(0x7fd581d9, 0x0e3e7aac), + TOBN(0x2b44c619, 0x2525ddf8), TOBN(0x67a033eb, 0xb8ae9739)}}, + {{TOBN(0x113ffec1, 0x9ef2d2e4), TOBN(0x1bf6767e, 0xd5a0ea7f), + TOBN(0x57fff75e, 0x03714c0a), TOBN(0xa23c422e, 0x0a23e9ee)}, + {TOBN(0xdd5f6b2d, 0x540f83af), TOBN(0xc2c2c27e, 0x55ea46a7), + TOBN(0xeb6b4246, 0x672a1208), TOBN(0xd13599f7, 0xae634f7a)}}, + {{TOBN(0xcf914b5c, 0xd7b32c6e), TOBN(0x61a5a640, 0xeaf61814), + TOBN(0x8dc3df8b, 0x208a1bbb), TOBN(0xef627fd6, 0xb6d79aa5)}, + {TOBN(0x44232ffc, 0xc4c86bc8), TOBN(0xe6f9231b, 0x061539fe), + TOBN(0x1d04f25a, 0x958b9533), TOBN(0x180cf934, 0x49e8c885)}}, + {{TOBN(0x89689595, 0x9884aaf7), TOBN(0xb1959be3, 0x07b348a6), + TOBN(0x96250e57, 0x3c147c87), TOBN(0xae0efb3a, 0xdd0c61f8)}, + {TOBN(0xed00745e, 0xca8c325e), TOBN(0x3c911696, 0xecff3f70), + TOBN(0x73acbc65, 0x319ad41d), TOBN(0x7b01a020, 0xf0b1c7ef)}}, + {{TOBN(0xea32b293, 0x63a1483f), TOBN(0x89eabe71, 0x7a248f96), + TOBN(0x9c6231d3, 0x343157e5), TOBN(0x93a375e5, 0xdf3c546d)}, + {TOBN(0xe76e9343, 0x6a2afe69), TOBN(0xc4f89100, 0xe166c88e), + TOBN(0x248efd0d, 0x4f872093), TOBN(0xae0eb3ea, 0x8fe0ea61)}}, + {{TOBN(0xaf89790d, 0x9d79046e), TOBN(0x4d650f2d, 0x6cee0976), + TOBN(0xa3935d9a, 0x43071eca), TOBN(0x66fcd2c9, 0x283b0bfe)}, + {TOBN(0x0e665eb5, 0x696605f1), TOBN(0xe77e5d07, 0xa54cd38d), + TOBN(0x90ee050a, 0x43d950cf), TOBN(0x86ddebda, 0xd32e69b5)}}, + {{TOBN(0x6ad94a3d, 0xfddf7415), TOBN(0xf7fa1309, 0x3f6e8d5a), + TOBN(0xc4831d1d, 0xe9957f75), TOBN(0x7de28501, 0xd5817447)}, + {TOBN(0x6f1d7078, 0x9e2aeb6b), TOBN(0xba2b9ff4, 0xf67a53c2), + TOBN(0x36963767, 0xdf9defc3), TOBN(0x479deed3, 0x0d38022c)}}, + {{TOBN(0xd2edb89b, 0x3a8631e8), TOBN(0x8de855de, 0x7a213746), + TOBN(0xb2056cb7, 0xb00c5f11), TOBN(0xdeaefbd0, 0x2c9b85e4)}, + {TOBN(0x03f39a8d, 0xd150892d), TOBN(0x37b84686, 0x218b7985), + TOBN(0x36296dd8, 0xb7375f1a), TOBN(0x472cd4b1, 0xb78e898e)}}, + {{TOBN(0x15dff651, 0xe9f05de9), TOBN(0xd4045069, 0x2ce98ba9), + TOBN(0x8466a7ae, 0x9b38024c), TOBN(0xb910e700, 0xe5a6b5ef)}, + {TOBN(0xae1c56ea, 0xb3aa8f0d), TOBN(0xbab2a507, 0x7eee74a6), + TOBN(0x0dca11e2, 0x4b4c4620), TOBN(0xfd896e2e, 0x4c47d1f4)}}, + {{TOBN(0xeb45ae53, 0x308fbd93), TOBN(0x46cd5a2e, 0x02c36fda), + TOBN(0x6a3d4e90, 0xbaa48385), TOBN(0xdd55e62e, 0x9dbe9960)}, + {TOBN(0xa1406aa0, 0x2a81ede7), TOBN(0x6860dd14, 0xf9274ea7), + TOBN(0xcfdcb0c2, 0x80414f86), TOBN(0xff410b10, 0x22f94327)}}, + {{TOBN(0x5a33cc38, 0x49ad467b), TOBN(0xefb48b6c, 0x0a7335f1), + TOBN(0x14fb54a4, 0xb153a360), TOBN(0x604aa9d2, 0xb52469cc)}, + {TOBN(0x5e9dc486, 0x754e48e9), TOBN(0x693cb455, 0x37471e8e), + TOBN(0xfb2fd7cd, 0x8d3b37b6), TOBN(0x63345e16, 0xcf09ff07)}}, + {{TOBN(0x9910ba6b, 0x23a5d896), TOBN(0x1fe19e35, 0x7fe4364e), + TOBN(0x6e1da8c3, 0x9a33c677), TOBN(0x15b4488b, 0x29fd9fd0)}, + {TOBN(0x1f439254, 0x1a1f22bf), TOBN(0x920a8a70, 0xab8163e8), + TOBN(0x3fd1b249, 0x07e5658e), TOBN(0xf2c4f79c, 0xb6ec839b)}}, + {{TOBN(0x1abbc3d0, 0x4aa38d1b), TOBN(0x3b0db35c, 0xb5d9510e), + TOBN(0x1754ac78, 0x3e60dec0), TOBN(0x53272fd7, 0xea099b33)}, + {TOBN(0x5fb0494f, 0x07a8e107), TOBN(0x4a89e137, 0x6a8191fa), + TOBN(0xa113b7f6, 
0x3c4ad544), TOBN(0x88a2e909, 0x6cb9897b)}}, + {{TOBN(0x17d55de3, 0xb44a3f84), TOBN(0xacb2f344, 0x17c6c690), + TOBN(0x32088168, 0x10232390), TOBN(0xf2e8a61f, 0x6c733bf7)}, + {TOBN(0xa774aab6, 0x9c2d7652), TOBN(0xfb5307e3, 0xed95c5bc), + TOBN(0xa05c73c2, 0x4981f110), TOBN(0x1baae31c, 0xa39458c9)}}, + {{TOBN(0x1def185b, 0xcbea62e7), TOBN(0xe8ac9eae, 0xeaf63059), + TOBN(0x098a8cfd, 0x9921851c), TOBN(0xd959c3f1, 0x3abe2f5b)}, + {TOBN(0xa4f19525, 0x20e40ae5), TOBN(0x320789e3, 0x07a24aa1), + TOBN(0x259e6927, 0x7392b2bc), TOBN(0x58f6c667, 0x1918668b)}}, + {{TOBN(0xce1db2bb, 0xc55d2d8b), TOBN(0x41d58bb7, 0xf4f6ca56), + TOBN(0x7650b680, 0x8f877614), TOBN(0x905e16ba, 0xf4c349ed)}, + {TOBN(0xed415140, 0xf661acac), TOBN(0x3b8784f0, 0xcb2270af), + TOBN(0x3bc280ac, 0x8a402cba), TOBN(0xd53f7146, 0x0937921a)}}, + {{TOBN(0xc03c8ee5, 0xe5681e83), TOBN(0x62126105, 0xf6ac9e4a), + TOBN(0x9503a53f, 0x936b1a38), TOBN(0x3d45e2d4, 0x782fecbd)}, + {TOBN(0x69a5c439, 0x76e8ae98), TOBN(0xb53b2eeb, 0xbfb4b00e), + TOBN(0xf1674712, 0x72386c89), TOBN(0x30ca34a2, 0x4268bce4)}}, + {{TOBN(0x7f1ed86c, 0x78341730), TOBN(0x8ef5beb8, 0xb525e248), + TOBN(0xbbc489fd, 0xb74fbf38), TOBN(0x38a92a0e, 0x91a0b382)}, + {TOBN(0x7a77ba3f, 0x22433ccf), TOBN(0xde8362d6, 0xa29f05a9), + TOBN(0x7f6a30ea, 0x61189afc), TOBN(0x693b5505, 0x59ef114f)}}, + {{TOBN(0x50266bc0, 0xcd1797a1), TOBN(0xea17b47e, 0xf4b7af2d), + TOBN(0xd6c4025c, 0x3df9483e), TOBN(0x8cbb9d9f, 0xa37b18c9)}, + {TOBN(0x91cbfd9c, 0x4d8424cf), TOBN(0xdb7048f1, 0xab1c3506), + TOBN(0x9eaf641f, 0x028206a3), TOBN(0xf986f3f9, 0x25bdf6ce)}}, + {{TOBN(0x262143b5, 0x224c08dc), TOBN(0x2bbb09b4, 0x81b50c91), + TOBN(0xc16ed709, 0xaca8c84f), TOBN(0xa6210d9d, 0xb2850ca8)}, + {TOBN(0x6d8df67a, 0x09cb54d6), TOBN(0x91eef6e0, 0x500919a4), + TOBN(0x90f61381, 0x0f132857), TOBN(0x9acede47, 0xf8d5028b)}}, + {{TOBN(0x844d1b71, 0x90b771c3), TOBN(0x563b71e4, 0xba6426be), + TOBN(0x2efa2e83, 0xbdb802ff), TOBN(0x3410cbab, 0xab5b4a41)}, + {TOBN(0x555b2d26, 0x30da84dd), TOBN(0xd0711ae9, 0xee1cc29a), + TOBN(0xcf3e8c60, 0x2f547792), TOBN(0x03d7d5de, 0xdc678b35)}}, + {{TOBN(0x071a2fa8, 0xced806b8), TOBN(0x222e6134, 0x697f1478), + TOBN(0xdc16fd5d, 0xabfcdbbf), TOBN(0x44912ebf, 0x121b53b8)}, + {TOBN(0xac943674, 0x2496c27c), TOBN(0x8ea3176c, 0x1ffc26b0), + TOBN(0xb6e224ac, 0x13debf2c), TOBN(0x524cc235, 0xf372a832)}}, + {{TOBN(0xd706e1d8, 0x9f6f1b18), TOBN(0x2552f005, 0x44cce35b), + TOBN(0x8c8326c2, 0xa88e31fc), TOBN(0xb5468b2c, 0xf9552047)}, + {TOBN(0xce683e88, 0x3ff90f2b), TOBN(0x77947bdf, 0x2f0a5423), + TOBN(0xd0a1b28b, 0xed56e328), TOBN(0xaee35253, 0xc20134ac)}}, + {{TOBN(0x7e98367d, 0x3567962f), TOBN(0x379ed61f, 0x8188bffb), + TOBN(0x73bba348, 0xfaf130a1), TOBN(0x6c1f75e1, 0x904ed734)}, + {TOBN(0x18956642, 0x3b4a79fc), TOBN(0xf20bc83d, 0x54ef4493), + TOBN(0x836d425d, 0x9111eca1), TOBN(0xe5b5c318, 0x009a8dcf)}}, + {{TOBN(0x3360b25d, 0x13221bc5), TOBN(0x707baad2, 0x6b3eeaf7), + TOBN(0xd7279ed8, 0x743a95a1), TOBN(0x7450a875, 0x969e809f)}, + {TOBN(0x32b6bd53, 0xe5d0338f), TOBN(0x1e77f7af, 0x2b883bbc), + TOBN(0x90da12cc, 0x1063ecd0), TOBN(0xe2697b58, 0xc315be47)}}, + {{TOBN(0x2771a5bd, 0xda85d534), TOBN(0x53e78c1f, 0xff980eea), + TOBN(0xadf1cf84, 0x900385e7), TOBN(0x7d3b14f6, 0xc9387b62)}, + {TOBN(0x170e74b0, 0xcb8f2bd2), TOBN(0x2d50b486, 0x827fa993), + TOBN(0xcdbe8c9a, 0xf6f32bab), TOBN(0x55e906b0, 0xc3b93ab8)}}, + {{TOBN(0x747f22fc, 0x8fe280d1), TOBN(0xcd8e0de5, 0xb2e114ab), + TOBN(0x5ab7dbeb, 0xe10b68b0), TOBN(0x9dc63a9c, 0xa480d4b2)}, + {TOBN(0x78d4bc3b, 0x4be1495f), TOBN(0x25eb3db8, 0x9359122d), + 
TOBN(0x3f8ac05b, 0x0809cbdc), TOBN(0xbf4187bb, 0xd37c702f)}}, + {{TOBN(0x84cea069, 0x1416a6a5), TOBN(0x8f860c79, 0x43ef881c), + TOBN(0x41311f8a, 0x38038a5d), TOBN(0xe78c2ec0, 0xfc612067)}, + {TOBN(0x494d2e81, 0x5ad73581), TOBN(0xb4cc9e00, 0x59604097), + TOBN(0xff558aec, 0xf3612cba), TOBN(0x35beef7a, 0x9e36c39e)}}, + {{TOBN(0x1845c7cf, 0xdbcf41b9), TOBN(0x5703662a, 0xaea997c0), + TOBN(0x8b925afe, 0xe402f6d8), TOBN(0xd0a1b1ae, 0x4dd72162)}, + {TOBN(0x9f47b375, 0x03c41c4b), TOBN(0xa023829b, 0x0391d042), + TOBN(0x5f5045c3, 0x503b8b0a), TOBN(0x123c2688, 0x98c010e5)}}, + {{TOBN(0x324ec0cc, 0x36ba06ee), TOBN(0xface3115, 0x3dd2cc0c), + TOBN(0xb364f3be, 0xf333e91f), TOBN(0xef8aff73, 0x28e832b0)}, + {TOBN(0x1e9bad04, 0x2d05841b), TOBN(0x42f0e3df, 0x356a21e2), + TOBN(0xa3270bcb, 0x4add627e), TOBN(0xb09a8158, 0xd322e711)}}, + {{TOBN(0x86e326a1, 0x0fee104a), TOBN(0xad7788f8, 0x3703f65d), + TOBN(0x7e765430, 0x47bc4833), TOBN(0x6cee582b, 0x2b9b893a)}, + {TOBN(0x9cd2a167, 0xe8f55a7b), TOBN(0xefbee3c6, 0xd9e4190d), + TOBN(0x33ee7185, 0xd40c2e9d), TOBN(0x844cc9c5, 0xa380b548)}}, + {{TOBN(0x323f8ecd, 0x66926e04), TOBN(0x0001e38f, 0x8110c1ba), + TOBN(0x8dbcac12, 0xfc6a7f07), TOBN(0xd65e1d58, 0x0cec0827)}, + {TOBN(0xd2cd4141, 0xbe76ca2d), TOBN(0x7895cf5c, 0xe892f33a), + TOBN(0x956d230d, 0x367139d2), TOBN(0xa91abd3e, 0xd012c4c1)}}, + {{TOBN(0x34fa4883, 0x87eb36bf), TOBN(0xc5f07102, 0x914b8fb4), + TOBN(0x90f0e579, 0xadb9c95f), TOBN(0xfe6ea8cb, 0x28888195)}, + {TOBN(0x7b9b5065, 0xedfa9284), TOBN(0x6c510bd2, 0x2b8c8d65), + TOBN(0xd7b8ebef, 0xcbe8aafd), TOBN(0xedb3af98, 0x96b1da07)}}, + {{TOBN(0x28ff779d, 0x6295d426), TOBN(0x0c4f6ac7, 0x3fa3ad7b), + TOBN(0xec44d054, 0x8b8e2604), TOBN(0x9b32a66d, 0x8b0050e1)}, + {TOBN(0x1f943366, 0xf0476ce2), TOBN(0x7554d953, 0xa602c7b4), + TOBN(0xbe35aca6, 0x524f2809), TOBN(0xb6881229, 0xfd4edbea)}}, + {{TOBN(0xe8cd0c8f, 0x508efb63), TOBN(0x9eb5b5c8, 0x6abcefc7), + TOBN(0xf5621f5f, 0xb441ab4f), TOBN(0x79e6c046, 0xb76a2b22)}, + {TOBN(0x74a4792c, 0xe37a1f69), TOBN(0xcbd252cb, 0x03542b60), + TOBN(0x785f65d5, 0xb3c20bd3), TOBN(0x8dea6143, 0x4fabc60c)}}, + {{TOBN(0x45e21446, 0xde673629), TOBN(0x57f7aa1e, 0x703c2d21), + TOBN(0xa0e99b7f, 0x98c868c7), TOBN(0x4e42f66d, 0x8b641676)}, + {TOBN(0x602884dc, 0x91077896), TOBN(0xa0d690cf, 0xc2c9885b), + TOBN(0xfeb4da33, 0x3b9a5187), TOBN(0x5f789598, 0x153c87ee)}}, + {{TOBN(0x2192dd47, 0x52b16dba), TOBN(0xdeefc0e6, 0x3524c1b1), + TOBN(0x465ea76e, 0xe4383693), TOBN(0x79401711, 0x361b8d98)}, + {TOBN(0xa5f9ace9, 0xf21a15cb), TOBN(0x73d26163, 0xefee9aeb), + TOBN(0xcca844b3, 0xe677016c), TOBN(0x6c122b07, 0x57eaee06)}}, + {{TOBN(0xb782dce7, 0x15f09690), TOBN(0x508b9b12, 0x2dfc0fc9), + TOBN(0x9015ab4b, 0x65d89fc6), TOBN(0x5e79dab7, 0xd6d5bb0f)}, + {TOBN(0x64f021f0, 0x6c775aa2), TOBN(0xdf09d8cc, 0x37c7eca1), + TOBN(0x9a761367, 0xef2fa506), TOBN(0xed4ca476, 0x5b81eec6)}}, + {{TOBN(0x262ede36, 0x10bbb8b5), TOBN(0x0737ce83, 0x0641ada3), + TOBN(0x4c94288a, 0xe9831ccc), TOBN(0x487fc1ce, 0x8065e635)}, + {TOBN(0xb13d7ab3, 0xb8bb3659), TOBN(0xdea5df3e, 0x855e4120), + TOBN(0xb9a18573, 0x85eb0244), TOBN(0x1a1b8ea3, 0xa7cfe0a3)}}, + {{TOBN(0x3b837119, 0x67b0867c), TOBN(0x8d5e0d08, 0x9d364520), + TOBN(0x52dccc1e, 0xd930f0e3), TOBN(0xefbbcec7, 0xbf20bbaf)}, + {TOBN(0x99cffcab, 0x0263ad10), TOBN(0xd8199e6d, 0xfcd18f8a), + TOBN(0x64e2773f, 0xe9f10617), TOBN(0x0079e8e1, 0x08704848)}}, + {{TOBN(0x1169989f, 0x8a342283), TOBN(0x8097799c, 0xa83012e6), + TOBN(0xece966cb, 0x8a6a9001), TOBN(0x93b3afef, 0x072ac7fc)}, + {TOBN(0xe6893a2a, 0x2db3d5ba), TOBN(0x263dc462, 0x89bf4fdc), 
+ TOBN(0x8852dfc9, 0xe0396673), TOBN(0x7ac70895, 0x3af362b6)}}, + {{TOBN(0xbb9cce4d, 0x5c2f342b), TOBN(0xbf80907a, 0xb52d7aae), + TOBN(0x97f3d3cd, 0x2161bcd0), TOBN(0xb25b0834, 0x0962744d)}, + {TOBN(0xc5b18ea5, 0x6c3a1dda), TOBN(0xfe4ec7eb, 0x06c92317), + TOBN(0xb787b890, 0xad1c4afe), TOBN(0xdccd9a92, 0x0ede801a)}}, + {{TOBN(0x9ac6ddda, 0xdb58da1f), TOBN(0x22bbc12f, 0xb8cae6ee), + TOBN(0xc6f8bced, 0x815c4a43), TOBN(0x8105a92c, 0xf96480c7)}, + {TOBN(0x0dc3dbf3, 0x7a859d51), TOBN(0xe3ec7ce6, 0x3041196b), + TOBN(0xd9f64b25, 0x0d1067c9), TOBN(0xf2321321, 0x3d1f8dd8)}}, + {{TOBN(0x8b5c619c, 0x76497ee8), TOBN(0x5d2b0ac6, 0xc717370e), + TOBN(0x98204cb6, 0x4fcf68e1), TOBN(0x0bdec211, 0x62bc6792)}, + {TOBN(0x6973ccef, 0xa63b1011), TOBN(0xf9e3fa97, 0xe0de1ac5), + TOBN(0x5efb693e, 0x3d0e0c8b), TOBN(0x037248e9, 0xd2d4fcb4)}}}, + {{{TOBN(0x80802dc9, 0x1ec34f9e), TOBN(0xd8772d35, 0x33810603), + TOBN(0x3f06d66c, 0x530cb4f3), TOBN(0x7be5ed0d, 0xc475c129)}, + {TOBN(0xcb9e3c19, 0x31e82b10), TOBN(0xc63d2857, 0xc9ff6b4c), + TOBN(0xb92118c6, 0x92a1b45e), TOBN(0x0aec4414, 0x7285bbca)}}, + {{TOBN(0xfc189ae7, 0x1e29a3ef), TOBN(0xcbe906f0, 0x4c93302e), + TOBN(0xd0107914, 0xceaae10e), TOBN(0xb7a23f34, 0xb68e19f8)}, + {TOBN(0xe9d875c2, 0xefd2119d), TOBN(0x03198c6e, 0xfcadc9c8), + TOBN(0x65591bf6, 0x4da17113), TOBN(0x3cf0bbf8, 0x3d443038)}}, + {{TOBN(0xae485bb7, 0x2b724759), TOBN(0x945353e1, 0xb2d4c63a), + TOBN(0x82159d07, 0xde7d6f2c), TOBN(0x389caef3, 0x4ec5b109)}, + {TOBN(0x4a8ebb53, 0xdb65ef14), TOBN(0x2dc2cb7e, 0xdd99de43), + TOBN(0x816fa3ed, 0x83f2405f), TOBN(0x73429bb9, 0xc14208a3)}}, + {{TOBN(0xb618d590, 0xb01e6e27), TOBN(0x047e2ccd, 0xe180b2dc), + TOBN(0xd1b299b5, 0x04aea4a9), TOBN(0x412c9e1e, 0x9fa403a4)}, + {TOBN(0x88d28a36, 0x79407552), TOBN(0x49c50136, 0xf332b8e3), + TOBN(0x3a1b6fcc, 0xe668de19), TOBN(0x178851bc, 0x75122b97)}}, + {{TOBN(0xb1e13752, 0xfb85fa4c), TOBN(0xd61257ce, 0x383c8ce9), + TOBN(0xd43da670, 0xd2f74dae), TOBN(0xa35aa23f, 0xbf846bbb)}, + {TOBN(0x5e74235d, 0x4421fc83), TOBN(0xf6df8ee0, 0xc363473b), + TOBN(0x34d7f52a, 0x3c4aa158), TOBN(0x50d05aab, 0x9bc6d22e)}}, + {{TOBN(0x8c56e735, 0xa64785f4), TOBN(0xbc56637b, 0x5f29cd07), + TOBN(0x53b2bb80, 0x3ee35067), TOBN(0x50235a0f, 0xdc919270)}, + {TOBN(0x191ab6d8, 0xf2c4aa65), TOBN(0xc3475831, 0x8396023b), + TOBN(0x80400ba5, 0xf0f805ba), TOBN(0x8881065b, 0x5ec0f80f)}}, + {{TOBN(0xc370e522, 0xcc1b5e83), TOBN(0xde2d4ad1, 0x860b8bfb), + TOBN(0xad364df0, 0x67b256df), TOBN(0x8f12502e, 0xe0138997)}, + {TOBN(0x503fa0dc, 0x7783920a), TOBN(0xe80014ad, 0xc0bc866a), + TOBN(0x3f89b744, 0xd3064ba6), TOBN(0x03511dcd, 0xcba5dba5)}}, + {{TOBN(0x197dd46d, 0x95a7b1a2), TOBN(0x9c4e7ad6, 0x3c6341fb), + TOBN(0x426eca29, 0x484c2ece), TOBN(0x9211e489, 0xde7f4f8a)}, + {TOBN(0x14997f6e, 0xc78ef1f4), TOBN(0x2b2c0910, 0x06574586), + TOBN(0x17286a6e, 0x1c3eede8), TOBN(0x25f92e47, 0x0f60e018)}}, + {{TOBN(0x805c5646, 0x31890a36), TOBN(0x703ef600, 0x57feea5b), + TOBN(0x389f747c, 0xaf3c3030), TOBN(0xe0e5daeb, 0x54dd3739)}, + {TOBN(0xfe24a4c3, 0xc9c9f155), TOBN(0x7e4bf176, 0xb5393962), + TOBN(0x37183de2, 0xaf20bf29), TOBN(0x4a1bd7b5, 0xf95a8c3b)}}, + {{TOBN(0xa83b9699, 0x46191d3d), TOBN(0x281fc8dd, 0x7b87f257), + TOBN(0xb18e2c13, 0x54107588), TOBN(0x6372def7, 0x9b2bafe8)}, + {TOBN(0xdaf4bb48, 0x0d8972ca), TOBN(0x3f2dd4b7, 0x56167a3f), + TOBN(0x1eace32d, 0x84310cf4), TOBN(0xe3bcefaf, 0xe42700aa)}}, + {{TOBN(0x5fe5691e, 0xd785e73d), TOBN(0xa5db5ab6, 0x2ea60467), + TOBN(0x02e23d41, 0xdfc6514a), TOBN(0x35e8048e, 0xe03c3665)}, + {TOBN(0x3f8b118f, 0x1adaa0f8), TOBN(0x28ec3b45, 
0x84ce1a5a), + TOBN(0xe8cacc6e, 0x2c6646b8), TOBN(0x1343d185, 0xdbd0e40f)}}, + {{TOBN(0xe5d7f844, 0xcaaa358c), TOBN(0x1a1db7e4, 0x9924182a), + TOBN(0xd64cd42d, 0x9c875d9a), TOBN(0xb37b515f, 0x042eeec8)}, + {TOBN(0x4d4dd409, 0x7b165fbe), TOBN(0xfc322ed9, 0xe206eff3), + TOBN(0x7dee4102, 0x59b7e17e), TOBN(0x55a481c0, 0x8236ca00)}}, + {{TOBN(0x8c885312, 0xc23fc975), TOBN(0x15715806, 0x05d6297b), + TOBN(0xa078868e, 0xf78edd39), TOBN(0x956b31e0, 0x03c45e52)}, + {TOBN(0x470275d5, 0xff7b33a6), TOBN(0xc8d5dc3a, 0x0c7e673f), + TOBN(0x419227b4, 0x7e2f2598), TOBN(0x8b37b634, 0x4c14a975)}}, + {{TOBN(0xd0667ed6, 0x8b11888c), TOBN(0x5e0e8c3e, 0x803e25dc), + TOBN(0x34e5d0dc, 0xb987a24a), TOBN(0x9f40ac3b, 0xae920323)}, + {TOBN(0x5463de95, 0x34e0f63a), TOBN(0xa128bf92, 0x6b6328f9), + TOBN(0x491ccd7c, 0xda64f1b7), TOBN(0x7ef1ec27, 0xc47bde35)}}, + {{TOBN(0xa857240f, 0xa36a2737), TOBN(0x35dc1366, 0x63621bc1), + TOBN(0x7a3a6453, 0xd4fb6897), TOBN(0x80f1a439, 0xc929319d)}, + {TOBN(0xfc18274b, 0xf8cb0ba0), TOBN(0xb0b53766, 0x8078c5eb), + TOBN(0xfb0d4924, 0x1e01d0ef), TOBN(0x50d7c67d, 0x372ab09c)}}, + {{TOBN(0xb4e370af, 0x3aeac968), TOBN(0xe4f7fee9, 0xc4b63266), + TOBN(0xb4acd4c2, 0xe3ac5664), TOBN(0xf8910bd2, 0xceb38cbf)}, + {TOBN(0x1c3ae50c, 0xc9c0726e), TOBN(0x15309569, 0xd97b40bf), + TOBN(0x70884b7f, 0xfd5a5a1b), TOBN(0x3890896a, 0xef8314cd)}}, + {{TOBN(0x58e1515c, 0xa5618c93), TOBN(0xe665432b, 0x77d942d1), + TOBN(0xb32181bf, 0xb6f767a8), TOBN(0x753794e8, 0x3a604110)}, + {TOBN(0x09afeb7c, 0xe8c0dbcc), TOBN(0x31e02613, 0x598673a3), + TOBN(0x5d98e557, 0x7d46db00), TOBN(0xfc21fb8c, 0x9d985b28)}}, + {{TOBN(0xc9040116, 0xb0843e0b), TOBN(0x53b1b3a8, 0x69b04531), + TOBN(0xdd1649f0, 0x85d7d830), TOBN(0xbb3bcc87, 0xcb7427e8)}, + {TOBN(0x77261100, 0xc93dce83), TOBN(0x7e79da61, 0xa1922a2a), + TOBN(0x587a2b02, 0xf3149ce8), TOBN(0x147e1384, 0xde92ec83)}}, + {{TOBN(0x484c83d3, 0xaf077f30), TOBN(0xea78f844, 0x0658b53a), + TOBN(0x912076c2, 0x027aec53), TOBN(0xf34714e3, 0x93c8177d)}, + {TOBN(0x37ef5d15, 0xc2376c84), TOBN(0x8315b659, 0x3d1aa783), + TOBN(0x3a75c484, 0xef852a90), TOBN(0x0ba0c58a, 0x16086bd4)}}, + {{TOBN(0x29688d7a, 0x529a6d48), TOBN(0x9c7f250d, 0xc2f19203), + TOBN(0x123042fb, 0x682e2df9), TOBN(0x2b7587e7, 0xad8121bc)}, + {TOBN(0x30fc0233, 0xe0182a65), TOBN(0xb82ecf87, 0xe3e1128a), + TOBN(0x71682861, 0x93fb098f), TOBN(0x043e21ae, 0x85e9e6a7)}}, + {{TOBN(0xab5b49d6, 0x66c834ea), TOBN(0x3be43e18, 0x47414287), + TOBN(0xf40fb859, 0x219a2a47), TOBN(0x0e6559e9, 0xcc58df3c)}, + {TOBN(0xfe1dfe8e, 0x0c6615b4), TOBN(0x14abc8fd, 0x56459d70), + TOBN(0x7be0fa8e, 0x05de0386), TOBN(0x8e63ef68, 0xe9035c7c)}}, + {{TOBN(0x116401b4, 0x53b31e91), TOBN(0x0cba7ad4, 0x4436b4d8), + TOBN(0x9151f9a0, 0x107afd66), TOBN(0xafaca8d0, 0x1f0ee4c4)}, + {TOBN(0x75fe5c1d, 0x9ee9761c), TOBN(0x3497a16b, 0xf0c0588f), + TOBN(0x3ee2bebd, 0x0304804c), TOBN(0xa8fb9a60, 0xc2c990b9)}}, + {{TOBN(0xd14d32fe, 0x39251114), TOBN(0x36bf25bc, 0xcac73366), + TOBN(0xc9562c66, 0xdba7495c), TOBN(0x324d301b, 0x46ad348b)}, + {TOBN(0x9f46620c, 0xd670407e), TOBN(0x0ea8d4f1, 0xe3733a01), + TOBN(0xd396d532, 0xb0c324e0), TOBN(0x5b211a0e, 0x03c317cd)}}, + {{TOBN(0x090d7d20, 0x5ffe7b37), TOBN(0x3b7f3efb, 0x1747d2da), + TOBN(0xa2cb525f, 0xb54fc519), TOBN(0x6e220932, 0xf66a971e)}, + {TOBN(0xddc160df, 0xb486d440), TOBN(0x7fcfec46, 0x3fe13465), + TOBN(0x83da7e4e, 0x76e4c151), TOBN(0xd6fa48a1, 0xd8d302b5)}}, + {{TOBN(0xc6304f26, 0x5872cd88), TOBN(0x806c1d3c, 0x278b90a1), + TOBN(0x3553e725, 0xcaf0bc1c), TOBN(0xff59e603, 0xbb9d8d5c)}, + {TOBN(0xa4550f32, 0x7a0b85dd), 
TOBN(0xdec5720a, 0x93ecc217), + TOBN(0x0b88b741, 0x69d62213), TOBN(0x7212f245, 0x5b365955)}}, + {{TOBN(0x20764111, 0xb5cae787), TOBN(0x13cb7f58, 0x1dfd3124), + TOBN(0x2dca77da, 0x1175aefb), TOBN(0xeb75466b, 0xffaae775)}, + {TOBN(0x74d76f3b, 0xdb6cff32), TOBN(0x7440f37a, 0x61fcda9a), + TOBN(0x1bb3ac92, 0xb525028b), TOBN(0x20fbf8f7, 0xa1975f29)}}, + {{TOBN(0x982692e1, 0xdf83097f), TOBN(0x28738f6c, 0x554b0800), + TOBN(0xdc703717, 0xa2ce2f2f), TOBN(0x7913b93c, 0x40814194)}, + {TOBN(0x04924593, 0x1fe89636), TOBN(0x7b98443f, 0xf78834a6), + TOBN(0x11c6ab01, 0x5114a5a1), TOBN(0x60deb383, 0xffba5f4c)}}, + {{TOBN(0x4caa54c6, 0x01a982e6), TOBN(0x1dd35e11, 0x3491cd26), + TOBN(0x973c315f, 0x7cbd6b05), TOBN(0xcab00775, 0x52494724)}, + {TOBN(0x04659b1f, 0x6565e15a), TOBN(0xbf30f529, 0x8c8fb026), + TOBN(0xfc21641b, 0xa8a0de37), TOBN(0xe9c7a366, 0xfa5e5114)}}, + {{TOBN(0xdb849ca5, 0x52f03ad8), TOBN(0xc7e8dbe9, 0x024e35c0), + TOBN(0xa1a2bbac, 0xcfc3c789), TOBN(0xbf733e7d, 0x9c26f262)}, + {TOBN(0x882ffbf5, 0xb8444823), TOBN(0xb7224e88, 0x6bf8483b), + TOBN(0x53023b8b, 0x65bef640), TOBN(0xaabfec91, 0xd4d5f8cd)}}, + {{TOBN(0xa40e1510, 0x079ea1bd), TOBN(0x1ad9addc, 0xd05d5d26), + TOBN(0xdb3f2eab, 0x13e68d4f), TOBN(0x1cff1ae2, 0x640f803f)}, + {TOBN(0xe0e7b749, 0xd4cee117), TOBN(0x8e9f275b, 0x4036d909), + TOBN(0xce34e31d, 0x8f4d4c38), TOBN(0x22b37f69, 0xd75130fc)}}, + {{TOBN(0x83e0f1fd, 0xb4014604), TOBN(0xa8ce9919, 0x89415078), + TOBN(0x82375b75, 0x41792efe), TOBN(0x4f59bf5c, 0x97d4515b)}, + {TOBN(0xac4f324f, 0x923a277d), TOBN(0xd9bc9b7d, 0x650f3406), + TOBN(0xc6fa87d1, 0x8a39bc51), TOBN(0x82588530, 0x5ccc108f)}}, + {{TOBN(0x5ced3c9f, 0x82e4c634), TOBN(0x8efb8314, 0x3a4464f8), + TOBN(0xe706381b, 0x7a1dca25), TOBN(0x6cd15a3c, 0x5a2a412b)}, + {TOBN(0x9347a8fd, 0xbfcd8fb5), TOBN(0x31db2eef, 0x6e54cd22), + TOBN(0xc4aeb11e, 0xf8d8932f), TOBN(0x11e7c1ed, 0x344411af)}}, + {{TOBN(0x2653050c, 0xdc9a151e), TOBN(0x9edbfc08, 0x3bb0a859), + TOBN(0x926c81c7, 0xfd5691e7), TOBN(0x9c1b2342, 0x6f39019a)}, + {TOBN(0x64a81c8b, 0x7f8474b9), TOBN(0x90657c07, 0x01761819), + TOBN(0x390b3331, 0x55e0375a), TOBN(0xc676c626, 0xb6ebc47d)}}, + {{TOBN(0x51623247, 0xb7d6dee8), TOBN(0x0948d927, 0x79659313), + TOBN(0x99700161, 0xe9ab35ed), TOBN(0x06cc32b4, 0x8ddde408)}, + {TOBN(0x6f2fd664, 0x061ef338), TOBN(0x1606fa02, 0xc202e9ed), + TOBN(0x55388bc1, 0x929ba99b), TOBN(0xc4428c5e, 0x1e81df69)}}, + {{TOBN(0xce2028ae, 0xf91b0b2a), TOBN(0xce870a23, 0xf03dfd3f), + TOBN(0x66ec2c87, 0x0affe8ed), TOBN(0xb205fb46, 0x284d0c00)}, + {TOBN(0xbf5dffe7, 0x44cefa48), TOBN(0xb6fc37a8, 0xa19876d7), + TOBN(0xbecfa84c, 0x08b72863), TOBN(0xd7205ff5, 0x2576374f)}}, + {{TOBN(0x80330d32, 0x8887de41), TOBN(0x5de0df0c, 0x869ea534), + TOBN(0x13f42753, 0x3c56ea17), TOBN(0xeb1f6069, 0x452b1a78)}, + {TOBN(0x50474396, 0xe30ea15c), TOBN(0x575816a1, 0xc1494125), + TOBN(0xbe1ce55b, 0xfe6bb38f), TOBN(0xb901a948, 0x96ae30f7)}}, + {{TOBN(0xe5af0f08, 0xd8fc3548), TOBN(0x5010b5d0, 0xd73bfd08), + TOBN(0x993d2880, 0x53fe655a), TOBN(0x99f2630b, 0x1c1309fd)}, + {TOBN(0xd8677baf, 0xb4e3b76f), TOBN(0x14e51ddc, 0xb840784b), + TOBN(0x326c750c, 0xbf0092ce), TOBN(0xc83d306b, 0xf528320f)}}, + {{TOBN(0xc4456715, 0x77d4715c), TOBN(0xd30019f9, 0x6b703235), + TOBN(0x207ccb2e, 0xd669e986), TOBN(0x57c824af, 0xf6dbfc28)}, + {TOBN(0xf0eb532f, 0xd8f92a23), TOBN(0x4a557fd4, 0x9bb98fd2), + TOBN(0xa57acea7, 0xc1e6199a), TOBN(0x0c663820, 0x8b94b1ed)}}, + {{TOBN(0x9b42be8f, 0xf83a9266), TOBN(0xc7741c97, 0x0101bd45), + TOBN(0x95770c11, 0x07bd9ceb), TOBN(0x1f50250a, 0x8b2e0744)}, + {TOBN(0xf762eec8, 
0x1477b654), TOBN(0xc65b900e, 0x15efe59a), + TOBN(0x88c96148, 0x9546a897), TOBN(0x7e8025b3, 0xc30b4d7c)}}, + {{TOBN(0xae4065ef, 0x12045cf9), TOBN(0x6fcb2caf, 0x9ccce8bd), + TOBN(0x1fa0ba4e, 0xf2cf6525), TOBN(0xf683125d, 0xcb72c312)}, + {TOBN(0xa01da4ea, 0xe312410e), TOBN(0x67e28677, 0x6cd8e830), + TOBN(0xabd95752, 0x98fb3f07), TOBN(0x05f11e11, 0xeef649a5)}}, + {{TOBN(0xba47faef, 0x9d3472c2), TOBN(0x3adff697, 0xc77d1345), + TOBN(0x4761fa04, 0xdd15afee), TOBN(0x64f1f61a, 0xb9e69462)}, + {TOBN(0xfa691fab, 0x9bfb9093), TOBN(0x3df8ae8f, 0xa1133dfe), + TOBN(0xcd5f8967, 0x58cc710d), TOBN(0xfbb88d50, 0x16c7fe79)}}, + {{TOBN(0x8e011b4c, 0xe88c50d1), TOBN(0x7532e807, 0xa8771c4f), + TOBN(0x64c78a48, 0xe2278ee4), TOBN(0x0b283e83, 0x3845072a)}, + {TOBN(0x98a6f291, 0x49e69274), TOBN(0xb96e9668, 0x1868b21c), + TOBN(0x38f0adc2, 0xb1a8908e), TOBN(0x90afcff7, 0x1feb829d)}}, + {{TOBN(0x9915a383, 0x210b0856), TOBN(0xa5a80602, 0xdef04889), + TOBN(0x800e9af9, 0x7c64d509), TOBN(0x81382d0b, 0xb8996f6f)}, + {TOBN(0x490eba53, 0x81927e27), TOBN(0x46c63b32, 0x4af50182), + TOBN(0x784c5fd9, 0xd3ad62ce), TOBN(0xe4fa1870, 0xf8ae8736)}}, + {{TOBN(0x4ec9d0bc, 0xd7466b25), TOBN(0x84ddbe1a, 0xdb235c65), + TOBN(0x5e2645ee, 0x163c1688), TOBN(0x570bd00e, 0x00eba747)}, + {TOBN(0xfa51b629, 0x128bfa0f), TOBN(0x92fce1bd, 0x6c1d3b68), + TOBN(0x3e7361dc, 0xb66778b1), TOBN(0x9c7d249d, 0x5561d2bb)}}, + {{TOBN(0xa40b28bf, 0x0bbc6229), TOBN(0x1c83c05e, 0xdfd91497), + TOBN(0x5f9f5154, 0xf083df05), TOBN(0xbac38b3c, 0xeee66c9d)}, + {TOBN(0xf71db7e3, 0xec0dfcfd), TOBN(0xf2ecda8e, 0x8b0a8416), + TOBN(0x52fddd86, 0x7812aa66), TOBN(0x2896ef10, 0x4e6f4272)}}, + {{TOBN(0xff27186a, 0x0fe9a745), TOBN(0x08249fcd, 0x49ca70db), + TOBN(0x7425a2e6, 0x441cac49), TOBN(0xf4a0885a, 0xece5ff57)}, + {TOBN(0x6e2cb731, 0x7d7ead58), TOBN(0xf96cf7d6, 0x1898d104), + TOBN(0xafe67c9d, 0x4f2c9a89), TOBN(0x89895a50, 0x1c7bf5bc)}}, + {{TOBN(0xdc7cb8e5, 0x573cecfa), TOBN(0x66497eae, 0xd15f03e6), + TOBN(0x6bc0de69, 0x3f084420), TOBN(0x323b9b36, 0xacd532b0)}, + {TOBN(0xcfed390a, 0x0115a3c1), TOBN(0x9414c40b, 0x2d65ca0e), + TOBN(0x641406bd, 0x2f530c78), TOBN(0x29369a44, 0x833438f2)}}, + {{TOBN(0x996884f5, 0x903fa271), TOBN(0xe6da0fd2, 0xb9da921e), + TOBN(0xa6f2f269, 0x5db01e54), TOBN(0x1ee3e9bd, 0x6876214e)}, + {TOBN(0xa26e181c, 0xe27a9497), TOBN(0x36d254e4, 0x8e215e04), + TOBN(0x42f32a6c, 0x252cabca), TOBN(0x99481487, 0x80b57614)}}, + {{TOBN(0x4c4dfe69, 0x40d9cae1), TOBN(0x05869580, 0x11a10f09), + TOBN(0xca287b57, 0x3491b64b), TOBN(0x77862d5d, 0x3fd4a53b)}, + {TOBN(0xbf94856e, 0x50349126), TOBN(0x2be30bd1, 0x71c5268f), + TOBN(0x10393f19, 0xcbb650a6), TOBN(0x639531fe, 0x778cf9fd)}}, + {{TOBN(0x02556a11, 0xb2935359), TOBN(0xda38aa96, 0xaf8c126e), + TOBN(0x47dbe6c2, 0x0960167f), TOBN(0x37bbabb6, 0x501901cd)}, + {TOBN(0xb6e979e0, 0x2c947778), TOBN(0xd69a5175, 0x7a1a1dc6), + TOBN(0xc3ed5095, 0x9d9faf0c), TOBN(0x4dd9c096, 0x1d5fa5f0)}}, + {{TOBN(0xa0c4304d, 0x64f16ea8), TOBN(0x8b1cac16, 0x7e718623), + TOBN(0x0b576546, 0x7c67f03e), TOBN(0x559cf5ad, 0xcbd88c01)}, + {TOBN(0x074877bb, 0x0e2af19a), TOBN(0x1f717ec1, 0xa1228c92), + TOBN(0x70bcb800, 0x326e8920), TOBN(0xec6e2c5c, 0x4f312804)}}, + {{TOBN(0x426aea7d, 0x3fca4752), TOBN(0xf12c0949, 0x2211f62a), + TOBN(0x24beecd8, 0x7be7b6b5), TOBN(0xb77eaf4c, 0x36d7a27d)}, + {TOBN(0x154c2781, 0xfda78fd3), TOBN(0x848a83b0, 0x264eeabe), + TOBN(0x81287ef0, 0x4ffe2bc4), TOBN(0x7b6d88c6, 0xb6b6fc2a)}}, + {{TOBN(0x805fb947, 0xce417d99), TOBN(0x4b93dcc3, 0x8b916cc4), + TOBN(0x72e65bb3, 0x21273323), TOBN(0xbcc1badd, 0x6ea9886e)}, + 
{TOBN(0x0e223011, 0x4bc5ee85), TOBN(0xa561be74, 0xc18ee1e4), + TOBN(0x762fd2d4, 0xa6bcf1f1), TOBN(0x50e6a5a4, 0x95231489)}}, + {{TOBN(0xca96001f, 0xa00b500b), TOBN(0x5c098cfc, 0x5d7dcdf5), + TOBN(0xa64e2d2e, 0x8c446a85), TOBN(0xbae9bcf1, 0x971f3c62)}, + {TOBN(0x4ec22683, 0x8435a2c5), TOBN(0x8ceaed6c, 0x4bad4643), + TOBN(0xe9f8fb47, 0xccccf4e3), TOBN(0xbd4f3fa4, 0x1ce3b21e)}}, + {{TOBN(0xd79fb110, 0xa3db3292), TOBN(0xe28a37da, 0xb536c66a), + TOBN(0x279ce87b, 0x8e49e6a9), TOBN(0x70ccfe8d, 0xfdcec8e3)}, + {TOBN(0x2193e4e0, 0x3ba464b2), TOBN(0x0f39d60e, 0xaca9a398), + TOBN(0x7d7932af, 0xf82c12ab), TOBN(0xd8ff50ed, 0x91e7e0f7)}}, + {{TOBN(0xea961058, 0xfa28a7e0), TOBN(0xc726cf25, 0x0bf5ec74), + TOBN(0xe74d55c8, 0xdb229666), TOBN(0x0bd9abbf, 0xa57f5799)}, + {TOBN(0x7479ef07, 0x4dfc47b3), TOBN(0xd9c65fc3, 0x0c52f91d), + TOBN(0x8e0283fe, 0x36a8bde2), TOBN(0xa32a8b5e, 0x7d4b7280)}}, + {{TOBN(0x6a677c61, 0x12e83233), TOBN(0x0fbb3512, 0xdcc9bf28), + TOBN(0x562e8ea5, 0x0d780f61), TOBN(0x0db8b22b, 0x1dc4e89c)}, + {TOBN(0x0a6fd1fb, 0x89be0144), TOBN(0x8c77d246, 0xca57113b), + TOBN(0x4639075d, 0xff09c91c), TOBN(0x5b47b17f, 0x5060824c)}}, + {{TOBN(0x58aea2b0, 0x16287b52), TOBN(0xa1343520, 0xd0cd8eb0), + TOBN(0x6148b4d0, 0xc5d58573), TOBN(0xdd2b6170, 0x291c68ae)}, + {TOBN(0xa61b3929, 0x1da3b3b7), TOBN(0x5f946d79, 0x08c4ac10), + TOBN(0x4105d4a5, 0x7217d583), TOBN(0x5061da3d, 0x25e6de5e)}}, + {{TOBN(0x3113940d, 0xec1b4991), TOBN(0xf12195e1, 0x36f485ae), + TOBN(0xa7507fb2, 0x731a2ee0), TOBN(0x95057a8e, 0x6e9e196e)}, + {TOBN(0xa3c2c911, 0x2e130136), TOBN(0x97dfbb36, 0x33c60d15), + TOBN(0xcaf3c581, 0xb300ee2b), TOBN(0x77f25d90, 0xf4bac8b8)}}, + {{TOBN(0xdb1c4f98, 0x6d840cd6), TOBN(0x471d62c0, 0xe634288c), + TOBN(0x8ec2f85e, 0xcec8a161), TOBN(0x41f37cbc, 0xfa6f4ae2)}, + {TOBN(0x6793a20f, 0x4b709985), TOBN(0x7a7bd33b, 0xefa8985b), + TOBN(0x2c6a3fbd, 0x938e6446), TOBN(0x19042619, 0x2a8d47c1)}}, + {{TOBN(0x16848667, 0xcc36975f), TOBN(0x02acf168, 0x9d5f1dfb), + TOBN(0x62d41ad4, 0x613baa94), TOBN(0xb56fbb92, 0x9f684670)}, + {TOBN(0xce610d0d, 0xe9e40569), TOBN(0x7b99c65f, 0x35489fef), + TOBN(0x0c88ad1b, 0x3df18b97), TOBN(0x81b7d9be, 0x5d0e9edb)}}, + {{TOBN(0xd85218c0, 0xc716cc0a), TOBN(0xf4b5ff90, 0x85691c49), + TOBN(0xa4fd666b, 0xce356ac6), TOBN(0x17c72895, 0x4b327a7a)}, + {TOBN(0xf93d5085, 0xda6be7de), TOBN(0xff71530e, 0x3301d34e), + TOBN(0x4cd96442, 0xd8f448e8), TOBN(0x9283d331, 0x2ed18ffa)}}, + {{TOBN(0x4d33dd99, 0x2a849870), TOBN(0xa716964b, 0x41576335), + TOBN(0xff5e3a9b, 0x179be0e5), TOBN(0x5b9d6b1b, 0x83b13632)}, + {TOBN(0x3b8bd7d4, 0xa52f313b), TOBN(0xc9dd95a0, 0x637a4660), + TOBN(0x30035962, 0x0b3e218f), TOBN(0xce1481a3, 0xc7b28a3c)}}, + {{TOBN(0xab41b43a, 0x43228d83), TOBN(0x24ae1c30, 0x4ad63f99), + TOBN(0x8e525f1a, 0x46a51229), TOBN(0x14af860f, 0xcd26d2b4)}, + {TOBN(0xd6baef61, 0x3f714aa1), TOBN(0xf51865ad, 0xeb78795e), + TOBN(0xd3e21fce, 0xe6a9d694), TOBN(0x82ceb1dd, 0x8a37b527)}}}}; diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.c b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.c new file mode 100644 index 0000000000..7ae2ffda77 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.c @@ -0,0 +1,437 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2014, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel +// +// Reference: +// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +// 256 Bit Primes" + +#include + +#include "../../limbs/limbs.inl" + +#include + +#include "p256-nistz.h" + +#if defined(OPENSSL_USE_NISTZ256) + +typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; + +// One converted into the Montgomery domain +static const BN_ULONG ONE_MONT[P256_LIMBS] = { + TOBN(0x00000000, 0x00000001), + TOBN(0xffffffff, 0x00000000), + TOBN(0xffffffff, 0xffffffff), + TOBN(0x00000000, 0xfffffffe), +}; + +// Precomputed tables for the default generator +#include "p256-nistz-table.h" + +// Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in +// util.c for details +static crypto_word_t booth_recode_w5(crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 5) - 1); + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + return (d << 1) + (s & 1); +} + +static crypto_word_t booth_recode_w7(crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 7) - 1); + d = (1 << 8) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + return (d << 1) + (s & 1); +} + +// The `(P256_LIMBS == 8)` case is unreachable for 64-bit targets. +#if defined(OPENSSL_64_BIT) && defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunreachable-code" +#endif + +// copy_conditional copies |src| to |dst| if |move| is one and leaves it as-is +// if |move| is zero. +// +// WARNING: this breaks the usual convention of constant-time functions +// returning masks. +static void copy_conditional(BN_ULONG dst[P256_LIMBS], + const BN_ULONG src[P256_LIMBS], BN_ULONG move) { + BN_ULONG mask1 = ((BN_ULONG)0) - move; + BN_ULONG mask2 = ~mask1; + + dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); + dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); + dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); + dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); + if (P256_LIMBS == 8) { + dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); + dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); + dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); + dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); + } +} + +#if defined(__clang__) +#pragma GCC diagnostic pop +#endif + +// is_not_zero returns one iff in != 0 and zero otherwise. +// +// WARNING: this breaks the usual convention of constant-time functions +// returning masks. +// +// (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64) +// (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f) +// ) +// +// (declare-fun x () (_ BitVec 64)) +// +// (assert (and (= x #x0000000000000000) (= (is_not_zero x) +// #x0000000000000001))) (check-sat) +// +// (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x) +// #x0000000000000000))) (check-sat) +// +static BN_ULONG is_not_zero(BN_ULONG in) { + in |= (0 - in); + in >>= BN_BITS2 - 1; + return in; +} + +#if defined(OPENSSL_X86_64) +// Dispatch between CPU variations. 
The "_adx" suffixed functions use MULX in +// addition to ADCX/ADOX. MULX is part of BMI2, not ADX, so we must check both +// capabilities. + void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (adx_bmi2_available) { + ecp_nistz256_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_mul_mont_nohw(res, a, b); + } +} + + void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]) { + if (adx_bmi2_available) { + ecp_nistz256_sqr_mont_adx(res, a); + } else { + ecp_nistz256_sqr_mont_nohw(res, a); + } +} + + void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (adx_bmi2_available) { + ecp_nistz256_ord_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_ord_mul_mont_nohw(res, a, b); + } +} + + void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + BN_ULONG rep) { + if (adx_bmi2_available) { + ecp_nistz256_ord_sqr_mont_adx(res, a, rep); + } else { + ecp_nistz256_ord_sqr_mont_nohw(res, a, rep); + } +} + +static void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], + int index) { + if (avx2_available) { + ecp_nistz256_select_w5_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w5_nohw(val, in_t, index); + } +} + +static void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], + int index) { + if (avx2_available) { + ecp_nistz256_select_w7_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w7_nohw(val, in_t, index); + } +} + + void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) { + if (adx_bmi2_available) { + ecp_nistz256_point_double_adx(r, a); + } else { + ecp_nistz256_point_double_nohw(r, a); + } +} + + void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b) { + if (adx_bmi2_available) { + ecp_nistz256_point_add_adx(r, a, b); + } else { + ecp_nistz256_point_add_nohw(r, a, b); + } +} + + void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b) { + if (adx_bmi2_available) { + ecp_nistz256_point_add_affine_adx(r, a, b); + } else { + ecp_nistz256_point_add_affine_nohw(r, a, b); + } +} +#endif // OPENSSL_X86_64 + +// r = p * p_scalar +static void ecp_nistz256_windowed_mul(P256_POINT *r, + const BN_ULONG p_scalar[P256_LIMBS], + const BN_ULONG p_x[P256_LIMBS], + const BN_ULONG p_y[P256_LIMBS]) { + debug_assert_nonsecret(r != NULL); + debug_assert_nonsecret(p_scalar != NULL); + debug_assert_nonsecret(p_x != NULL); + debug_assert_nonsecret(p_y != NULL); + + static const size_t kWindowSize = 5; + static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + + // A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should + // add no more than 63 bytes of overhead. Thus, |table| should require + // ~1599 ((96 * 16) + 63) bytes of stack space. + alignas(64) P256_POINT table[16]; + P256_SCALAR_BYTES p_str; + p256_scalar_bytes_from_limbs(p_str, p_scalar); + + // table[0] is implicitly (0,0,0) (the point at infinity), therefore it is + // not stored. All other values are actually stored with an offset of -1 in + // table. 
+ P256_POINT *row = table; + + limbs_copy(row[1 - 1].X, p_x, P256_LIMBS); + limbs_copy(row[1 - 1].Y, p_y, P256_LIMBS); + limbs_copy(row[1 - 1].Z, ONE_MONT, P256_LIMBS); + + ecp_nistz256_point_double(&row[2 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[4 - 1], &row[2 - 1]); + ecp_nistz256_point_double(&row[6 - 1], &row[3 - 1]); + ecp_nistz256_point_double(&row[8 - 1], &row[4 - 1]); + ecp_nistz256_point_double(&row[12 - 1], &row[6 - 1]); + ecp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[14 - 1], &row[7 - 1]); + ecp_nistz256_point_double(&row[10 - 1], &row[5 - 1]); + ecp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[16 - 1], &row[8 - 1]); + + BN_ULONG tmp[P256_LIMBS]; + alignas(32) P256_POINT h; + size_t index = 255; + crypto_word_t wvalue = p_str[(index - 1) / 8]; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + + ecp_nistz256_select_w5(r, table, (int)(booth_recode_w5(wvalue) >> 1)); + + while (index >= 5) { + if (index != 255) { + size_t off = (index - 1) / 8; + + wvalue = (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + + wvalue = booth_recode_w5(wvalue); + + ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1)); + + ecp_nistz256_neg(tmp, h.Y); + copy_conditional(h.Y, tmp, (wvalue & 1)); + + ecp_nistz256_point_add(r, r, &h); + } + + index -= kWindowSize; + + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + } + + // Final window + wvalue = p_str[0]; + wvalue = (wvalue << 1) & kMask; + + wvalue = booth_recode_w5(wvalue); + + ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1)); + + ecp_nistz256_neg(tmp, h.Y); + copy_conditional(h.Y, tmp, wvalue & 1); + + ecp_nistz256_point_add(r, r, &h); +} + +static crypto_word_t calc_first_wvalue(size_t *index, const uint8_t p_str[33]) { + static const size_t kWindowSize = 7; + static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + *index = kWindowSize; + + crypto_word_t wvalue = ((crypto_word_t)p_str[0] << 1) & kMask; + return booth_recode_w7(wvalue); +} + +static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) { + static const size_t kWindowSize = 7; + static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + + const size_t off = (*index - 1) / 8; + crypto_word_t wvalue = + (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; + wvalue = (wvalue >> ((*index - 1) % 8)) & kMask; + *index += kWindowSize; + + return booth_recode_w7(wvalue); +} + +void p256_point_mul(Limb r[3][P256_LIMBS], const Limb p_scalar[P256_LIMBS], + const Limb p_x[P256_LIMBS], + const Limb p_y[P256_LIMBS]) { + alignas(32) P256_POINT out; + ecp_nistz256_windowed_mul(&out, p_scalar, p_x, p_y); + + limbs_copy(r[0], out.X, P256_LIMBS); + limbs_copy(r[1], out.Y, P256_LIMBS); + limbs_copy(r[2], out.Z, P256_LIMBS); +} + +void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) { + P256_SCALAR_BYTES p_str; + p256_scalar_bytes_from_limbs(p_str, scalar); + + // First window + size_t index = 
0; + crypto_word_t wvalue = calc_first_wvalue(&index, p_str); + + alignas(32) P256_POINT_AFFINE t; + alignas(32) P256_POINT p; + ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[0], (int)(wvalue >> 1)); + ecp_nistz256_neg(p.Z, t.Y); + copy_conditional(t.Y, p.Z, wvalue & 1); + + // Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t| + // is infinity and |ONE| otherwise. |t| was computed from the table, so it + // is infinity iff |wvalue >> 1| is zero. + limbs_copy(p.X, t.X, P256_LIMBS); + limbs_copy(p.Y, t.Y, P256_LIMBS); + limbs_zero(p.Z, P256_LIMBS); + copy_conditional(p.Z, ONE_MONT, is_not_zero(wvalue >> 1)); + + for (int i = 1; i < 37; i++) { + wvalue = calc_wvalue(&index, p_str); + + ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[i], (int)(wvalue >> 1)); + + alignas(32) BN_ULONG neg_Y[P256_LIMBS]; + ecp_nistz256_neg(neg_Y, t.Y); + copy_conditional(t.Y, neg_Y, wvalue & 1); + + // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are the + // same non-infinity point. + ecp_nistz256_point_add_affine(&p, &p, &t); + } + + limbs_copy(r[0], p.X, P256_LIMBS); + limbs_copy(r[1], p.Y, P256_LIMBS); + limbs_copy(r[2], p.Z, P256_LIMBS); +} + +void p256_point_mul_base_vartime(Limb r[3][P256_LIMBS], + const Limb g_scalar[P256_LIMBS]) { + alignas(32) P256_POINT p; + uint8_t p_str[33]; + OPENSSL_memcpy(p_str, g_scalar, 32); + p_str[32] = 0; + + // First window + size_t index = 0; + size_t wvalue = calc_first_wvalue(&index, p_str); + + // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p| + // is infinity and |ONE_MONT| otherwise. |p| was computed from the table, so + // it is infinity iff |wvalue >> 1| is zero. + if ((wvalue >> 1) != 0) { + OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X, + sizeof(p.X)); + OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y, + sizeof(p.Y)); + OPENSSL_memcpy(p.Z, ONE_MONT, sizeof(p.Z)); + } else { + OPENSSL_memset(p.X, 0, sizeof(p.X)); + OPENSSL_memset(p.Y, 0, sizeof(p.Y)); + OPENSSL_memset(p.Z, 0, sizeof(p.Z)); + } + + if ((wvalue & 1) == 1) { + ecp_nistz256_neg(p.Y, p.Y); + } + + for (int i = 1; i < 37; i++) { + wvalue = calc_wvalue(&index, p_str); + if ((wvalue >> 1) == 0) { + continue; + } + + alignas(32) P256_POINT_AFFINE t; + OPENSSL_memcpy(&t, &ecp_nistz256_precomputed[i][(wvalue >> 1) - 1], + sizeof(t)); + if ((wvalue & 1) == 1) { + ecp_nistz256_neg(t.Y, t.Y); + } + + // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are + // the same non-infinity point, so it is important that we compute the + // |g_scalar| term before the |p_scalar| term. + ecp_nistz256_point_add_affine(&p, &p, &t); + } + + + limbs_copy(r[0], p.X, P256_LIMBS); + limbs_copy(r[1], p.Y, P256_LIMBS); + limbs_copy(r[2], p.Z, P256_LIMBS); +} + +#endif /* defined(OPENSSL_USE_NISTZ256) */ diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.h b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.h new file mode 100644 index 0000000000..ff01f501f7 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256-nistz.h @@ -0,0 +1,171 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2014, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel +// +// Reference: +// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +// 256 Bit Primes" + +#ifndef OPENSSL_HEADER_EC_P256_X86_64_H +#define OPENSSL_HEADER_EC_P256_X86_64_H + +#include + +#include "p256_shared.h" + +#include "../bn/internal.h" + +#if defined(OPENSSL_USE_NISTZ256) + +// ecp_nistz256_neg sets |res| to -|a| mod P. +void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); + +// ecp_nistz256_mul_mont sets |res| to |a| * |b| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else +void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#endif + +// ecp_nistz256_sqr_mont sets |res| to |a| * |a| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +void ecp_nistz256_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +#else +void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +#endif + + +// P-256 scalar operations. +// +// The following functions compute modulo N, where N is the order of P-256. They +// take fully-reduced inputs and give fully-reduced outputs. + +// ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs +// are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_ord_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else +void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#endif + +// ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2*|rep|) where inputs and +// outputs are in Montgomery form. That is, |res| is +// (|a| * 2^-256)^(2*|rep|) * 2^256 mod N. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +void ecp_nistz256_ord_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#else +void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#endif + + + +// P-256 point operations. +// +// The following functions may be used in-place. All coordinates are in the +// Montgomery domain. + +// A P256_POINT_AFFINE represents a P-256 point in affine coordinates. Infinity +// is encoded as (0, 0). 
+typedef struct { + BN_ULONG X[P256_LIMBS]; + BN_ULONG Y[P256_LIMBS]; +} P256_POINT_AFFINE; + +// ecp_nistz256_select_w5 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 16 +// and all zeros (the point at infinity) if |index| is 0. This is done in +// constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w5_nohw(P256_POINT *val, const P256_POINT in_t[16], + int index); +void ecp_nistz256_select_w5_avx2(P256_POINT *val, const P256_POINT in_t[16], + int index); +#else +void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], + int index); +#endif + +// ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64 +// and all zeros (the point at infinity) if |index| is 0. This is done in +// constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w7_nohw(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +void ecp_nistz256_select_w7_avx2(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +#else +void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +#endif + +// ecp_nistz256_point_double sets |r| to |a| doubled. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_double_nohw(P256_POINT *r, const P256_POINT *a); +void ecp_nistz256_point_double_adx(P256_POINT *r, const P256_POINT *a); +#else +void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); +#endif + +// ecp_nistz256_point_add adds |a| to |b| and places the result in |r|. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +void ecp_nistz256_point_add_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +#else +void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +#endif + +// ecp_nistz256_point_add_affine adds |a| to |b| and places the result in +// |r|. |a| and |b| must not represent the same point unless they are both +// infinity. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_affine_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +void ecp_nistz256_point_add_affine_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +#else +void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +#endif + +#endif /* defined(OPENSSL_USE_NISTZ256) */ + +#endif // OPENSSL_HEADER_EC_P256_X86_64_H diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256.c b/ring-0.17.14/crypto/fipsmodule/ec/p256.c new file mode 100644 index 0000000000..0117916dab --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256.c @@ -0,0 +1,539 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// An implementation of the NIST P-256 elliptic curve point multiplication. +// 256-bit Montgomery form for 64 and 32-bit. Field operations are generated by +// Fiat, which lives in //third_party/fiat. 
+ +#include + +#include "../../limbs/limbs.h" +#include "../../limbs/limbs.inl" + +#include "p256_shared.h" + +#include "../../internal.h" +#include "./util.h" + +#if !defined(OPENSSL_USE_NISTZ256) + +#if defined(_MSC_VER) && !defined(__clang__) +// '=': conversion from 'int64_t' to 'int32_t', possible loss of data +#pragma warning(disable: 4242) +// '=': conversion from 'int32_t' to 'uint8_t', possible loss of data +#pragma warning(disable: 4244) +// 'initializing': conversion from 'size_t' to 'fiat_p256_limb_t' +#pragma warning(disable: 4267) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Winline" +#endif + +#if defined(BORINGSSL_HAS_UINT128) +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include "../../../third_party/fiat/p256_64.h" +#elif defined(OPENSSL_64_BIT) +#include "../../../third_party/fiat/p256_64_msvc.h" +#else +#include "../../../third_party/fiat/p256_32.h" +#endif + + +// utility functions, handwritten + +#if defined(OPENSSL_64_BIT) +#define FIAT_P256_NLIMBS 4 +typedef uint64_t fiat_p256_limb_t; +typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS]; +static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000, + 0xffffffffffffffff, 0xfffffffe}; +#else // 64BIT; else 32BIT +#define FIAT_P256_NLIMBS 8 +typedef uint32_t fiat_p256_limb_t; +typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS]; +static const fiat_p256_felem fiat_p256_one = { + 0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0}; +#endif // 64BIT + + +static fiat_p256_limb_t fiat_p256_nz( + const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { + fiat_p256_limb_t ret; + fiat_p256_nonzero(&ret, in1); + return ret; +} + +static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS], + const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { + for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) { + out[i] = in1[i]; + } +} + +static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS], + fiat_p256_limb_t t, + const fiat_p256_limb_t z[FIAT_P256_NLIMBS], + const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) { + fiat_p256_selectznz(out, !!t, z, nz); +} + +static void fiat_p256_from_words(fiat_p256_felem out, + const Limb in[32 / sizeof(BN_ULONG)]) { + // Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on + // 64-bit platforms without |uint128_t|, they are different. However, on + // little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same + // layout. + OPENSSL_memcpy(out, in, 32); +} + +static void fiat_p256_to_words(Limb out[32 / sizeof(BN_ULONG)], const fiat_p256_felem in) { + // See |fiat_p256_from_words|. + OPENSSL_memcpy(out, in, 32); +} + + +// Group operations +// ---------------- +// +// Building on top of the field operations we have the operations on the +// elliptic curve group itself. Points on the curve are represented in Jacobian +// coordinates. +// +// Both operations were transcribed to Coq and proven to correspond to naive +// implementations using Affine coordinates, for all suitable fields. In the +// Coq proofs, issues of constant-time execution and memory layout (aliasing) +// conventions were not considered. 
Specification of affine coordinates: +// +// As a sanity check, a proof that these points form a commutative group: +// + +// fiat_p256_point_double calculates 2*(x_in, y_in, z_in) +// +// The method is taken from: +// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b +// +// Coq transcription and correctness proof: +// +// +// +// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. +// while x_out == y_in is not (maybe this works, but it's not tested). +static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out, + fiat_p256_felem z_out, + const fiat_p256_felem x_in, + const fiat_p256_felem y_in, + const fiat_p256_felem z_in) { + fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta; + // delta = z^2 + fiat_p256_square(delta, z_in); + // gamma = y^2 + fiat_p256_square(gamma, y_in); + // beta = x*gamma + fiat_p256_mul(beta, x_in, gamma); + + // alpha = 3*(x-delta)*(x+delta) + fiat_p256_sub(ftmp, x_in, delta); + fiat_p256_add(ftmp2, x_in, delta); + + fiat_p256_add(tmptmp, ftmp2, ftmp2); + fiat_p256_add(ftmp2, ftmp2, tmptmp); + fiat_p256_mul(alpha, ftmp, ftmp2); + + // x' = alpha^2 - 8*beta + fiat_p256_square(x_out, alpha); + fiat_p256_add(fourbeta, beta, beta); + fiat_p256_add(fourbeta, fourbeta, fourbeta); + fiat_p256_add(tmptmp, fourbeta, fourbeta); + fiat_p256_sub(x_out, x_out, tmptmp); + + // z' = (y + z)^2 - gamma - delta + fiat_p256_add(delta, gamma, delta); + fiat_p256_add(ftmp, y_in, z_in); + fiat_p256_square(z_out, ftmp); + fiat_p256_sub(z_out, z_out, delta); + + // y' = alpha*(4*beta - x') - 8*gamma^2 + fiat_p256_sub(y_out, fourbeta, x_out); + fiat_p256_add(gamma, gamma, gamma); + fiat_p256_square(gamma, gamma); + fiat_p256_mul(y_out, alpha, y_out); + fiat_p256_add(gamma, gamma, gamma); + fiat_p256_sub(y_out, y_out, gamma); +} + +// fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2) +// +// The method is taken from: +// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, +// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). +// +// Coq transcription and correctness proof: +// +// +// +// This function includes a branch for checking whether the two input points +// are equal, (while not equal to the point at infinity). This case never +// happens during single point multiplication, so there is no timing leak for +// ECDH or ECDSA signing. +static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3, + fiat_p256_felem z3, const fiat_p256_felem x1, + const fiat_p256_felem y1, + const fiat_p256_felem z1, const int mixed, + const fiat_p256_felem x2, + const fiat_p256_felem y2, + const fiat_p256_felem z2) { + fiat_p256_felem x_out, y_out, z_out; + fiat_p256_limb_t z1nz = fiat_p256_nz(z1); + fiat_p256_limb_t z2nz = fiat_p256_nz(z2); + + // z1z1 = z1z1 = z1**2 + fiat_p256_felem z1z1; + fiat_p256_square(z1z1, z1); + + fiat_p256_felem u1, s1, two_z1z2; + if (!mixed) { + // z2z2 = z2**2 + fiat_p256_felem z2z2; + fiat_p256_square(z2z2, z2); + + // u1 = x1*z2z2 + fiat_p256_mul(u1, x1, z2z2); + + // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 + fiat_p256_add(two_z1z2, z1, z2); + fiat_p256_square(two_z1z2, two_z1z2); + fiat_p256_sub(two_z1z2, two_z1z2, z1z1); + fiat_p256_sub(two_z1z2, two_z1z2, z2z2); + + // s1 = y1 * z2**3 + fiat_p256_mul(s1, z2, z2z2); + fiat_p256_mul(s1, s1, y1); + } else { + // We'll assume z2 = 1 (special case z2 = 0 is handled later). 
+ + // u1 = x1*z2z2 + fiat_p256_copy(u1, x1); + // two_z1z2 = 2z1z2 + fiat_p256_add(two_z1z2, z1, z1); + // s1 = y1 * z2**3 + fiat_p256_copy(s1, y1); + } + + // u2 = x2*z1z1 + fiat_p256_felem u2; + fiat_p256_mul(u2, x2, z1z1); + + // h = u2 - u1 + fiat_p256_felem h; + fiat_p256_sub(h, u2, u1); + + fiat_p256_limb_t xneq = fiat_p256_nz(h); + + // z_out = two_z1z2 * h + fiat_p256_mul(z_out, h, two_z1z2); + + // z1z1z1 = z1 * z1z1 + fiat_p256_felem z1z1z1; + fiat_p256_mul(z1z1z1, z1, z1z1); + + // s2 = y2 * z1**3 + fiat_p256_felem s2; + fiat_p256_mul(s2, y2, z1z1z1); + + // r = (s2 - s1)*2 + fiat_p256_felem r; + fiat_p256_sub(r, s2, s1); + fiat_p256_add(r, r, r); + + fiat_p256_limb_t yneq = fiat_p256_nz(r); + + fiat_p256_limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) & + ~constant_time_is_zero_w(z1nz) & + ~constant_time_is_zero_w(z2nz); + if (constant_time_declassify_w(is_nontrivial_double)) { + fiat_p256_point_double(x3, y3, z3, x1, y1, z1); + return; + } + + // I = (2h)**2 + fiat_p256_felem i; + fiat_p256_add(i, h, h); + fiat_p256_square(i, i); + + // J = h * I + fiat_p256_felem j; + fiat_p256_mul(j, h, i); + + // V = U1 * I + fiat_p256_felem v; + fiat_p256_mul(v, u1, i); + + // x_out = r**2 - J - 2V + fiat_p256_square(x_out, r); + fiat_p256_sub(x_out, x_out, j); + fiat_p256_sub(x_out, x_out, v); + fiat_p256_sub(x_out, x_out, v); + + // y_out = r(V-x_out) - 2 * s1 * J + fiat_p256_sub(y_out, v, x_out); + fiat_p256_mul(y_out, y_out, r); + fiat_p256_felem s1j; + fiat_p256_mul(s1j, s1, j); + fiat_p256_sub(y_out, y_out, s1j); + fiat_p256_sub(y_out, y_out, s1j); + + fiat_p256_cmovznz(x_out, z1nz, x2, x_out); + fiat_p256_cmovznz(x3, z2nz, x1, x_out); + fiat_p256_cmovznz(y_out, z1nz, y2, y_out); + fiat_p256_cmovznz(y3, z2nz, y1, y_out); + fiat_p256_cmovznz(z_out, z1nz, z2, z_out); + fiat_p256_cmovznz(z3, z2nz, z1, z_out); +} + +#include "./p256_table.h" + +// fiat_p256_select_point_affine selects the |idx-1|th point from a +// precomputation table and copies it to out. If |idx| is zero, the output is +// the point at infinity. +static void fiat_p256_select_point_affine( + const fiat_p256_limb_t idx, size_t size, + const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) { + OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); + for (size_t i = 0; i < size; i++) { + fiat_p256_limb_t mismatch = i ^ (idx - 1); + fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); + fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); + } + fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one); +} + +// fiat_p256_select_point selects the |idx|th point from a precomputation table +// and copies it to out. 
+static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size, + const fiat_p256_felem pre_comp[/*size*/][3], + fiat_p256_felem out[3]) { + OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); + for (size_t i = 0; i < size; i++) { + fiat_p256_limb_t mismatch = i ^ idx; + fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); + fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); + fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]); + } +} + +// fiat_p256_get_bit returns the |i|th bit in |in| +static crypto_word_t fiat_p256_get_bit(const Limb in[P256_LIMBS], int i) { + if (i < 0 || i >= 256) { + return 0; + } +#if defined(OPENSSL_64_BIT) + OPENSSL_STATIC_ASSERT(sizeof(Limb) == 8, "BN_ULONG was not 64-bit"); + return (in[i >> 6] >> (i & 63)) & 1; +#else + OPENSSL_STATIC_ASSERT(sizeof(Limb) == 4, "BN_ULONG was not 32-bit"); + return (in[i >> 5] >> (i & 31)) & 1; +#endif +} + +void p256_point_mul(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS], + const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) { + debug_assert_nonsecret(r != NULL); + debug_assert_nonsecret(scalar != NULL); + debug_assert_nonsecret(p_x != NULL); + debug_assert_nonsecret(p_y != NULL); + + fiat_p256_felem p_pre_comp[17][3]; + OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp)); + // Precompute multiples. + fiat_p256_from_words(p_pre_comp[1][0], p_x); + fiat_p256_from_words(p_pre_comp[1][1], p_y); + fiat_p256_copy(p_pre_comp[1][2], fiat_p256_one); + + for (size_t j = 2; j <= 16; ++j) { + if (j & 1) { + fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2], + p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], + 0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1], + p_pre_comp[j - 1][2]); + } else { + fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1], + p_pre_comp[j][2], p_pre_comp[j / 2][0], + p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]); + } + } + + // Set nq to the point at infinity. + fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3]; + + // Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round. + int skip = 1; // Save two point operations in the first round. + for (size_t i = 255; i < 256; i--) { + // double + if (!skip) { + fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + } + + // do other additions every 5 doublings + if (i % 5 == 0) { + crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5; + bits |= fiat_p256_get_bit(scalar, i + 3) << 4; + bits |= fiat_p256_get_bit(scalar, i + 2) << 3; + bits |= fiat_p256_get_bit(scalar, i + 1) << 2; + bits |= fiat_p256_get_bit(scalar, i) << 1; + bits |= fiat_p256_get_bit(scalar, i - 1); + crypto_word_t sign, digit; + recode_scalar_bits(&sign, &digit, bits); + + // select the point to add or subtract, in constant time. + fiat_p256_select_point((fiat_p256_limb_t)digit, 17, + RING_CORE_POINTLESS_ARRAY_CONST_CAST((const fiat_p256_felem(*)[3]))p_pre_comp, + tmp); + fiat_p256_opp(ftmp, tmp[1]); // (X, -Y, Z) is the negative point. + fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp); + + if (!skip) { + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], + 0 /* mixed */, tmp[0], tmp[1], tmp[2]); + } else { + fiat_p256_copy(nq[0], tmp[0]); + fiat_p256_copy(nq[1], tmp[1]); + fiat_p256_copy(nq[2], tmp[2]); + skip = 0; + } + } + } + + fiat_p256_to_words(r[0], nq[0]); + fiat_p256_to_words(r[1], nq[1]); + fiat_p256_to_words(r[2], nq[2]); +} + +void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) { + // Set nq to the point at infinity. 
+ fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3]; + + int skip = 1; // Save two point operations in the first round. + for (size_t i = 31; i < 32; i--) { + if (!skip) { + fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + } + + // First, look 32 bits upwards. + crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3; + bits |= fiat_p256_get_bit(scalar, i + 160) << 2; + bits |= fiat_p256_get_bit(scalar, i + 96) << 1; + bits |= fiat_p256_get_bit(scalar, i + 32); + // Select the point to add, in constant time. + fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, + fiat_p256_g_pre_comp[1], tmp); + + if (!skip) { + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } else { + fiat_p256_copy(nq[0], tmp[0]); + fiat_p256_copy(nq[1], tmp[1]); + fiat_p256_copy(nq[2], tmp[2]); + skip = 0; + } + + // Second, look at the current position. + bits = fiat_p256_get_bit(scalar, i + 192) << 3; + bits |= fiat_p256_get_bit(scalar, i + 128) << 2; + bits |= fiat_p256_get_bit(scalar, i + 64) << 1; + bits |= fiat_p256_get_bit(scalar, i); + // Select the point to add, in constant time. + fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, + fiat_p256_g_pre_comp[0], tmp); + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, + tmp[0], tmp[1], tmp[2]); + } + + fiat_p256_to_words(r[0], nq[0]); + fiat_p256_to_words(r[1], nq[1]); + fiat_p256_to_words(r[2], nq[2]); +} + +void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS], + const Limb b[P256_LIMBS]) { + fiat_p256_felem a_, b_; + fiat_p256_from_words(a_, a); + fiat_p256_from_words(b_, b); + fiat_p256_mul(a_, a_, b_); + fiat_p256_to_words(r, a_); +} + +void p256_sqr_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) { + fiat_p256_felem x; + fiat_p256_from_words(x, a); + fiat_p256_square(x, x); + fiat_p256_to_words(r, x); +} + +void p256_point_add(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS], + const Limb b[3][P256_LIMBS]) { + fiat_p256_felem x1, y1, z1, x2, y2, z2; + fiat_p256_from_words(x1, a[0]); + fiat_p256_from_words(y1, a[1]); + fiat_p256_from_words(z1, a[2]); + fiat_p256_from_words(x2, b[0]); + fiat_p256_from_words(y2, b[1]); + fiat_p256_from_words(z2, b[2]); + fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2, + z2); + fiat_p256_to_words(r[0], x1); + fiat_p256_to_words(r[1], y1); + fiat_p256_to_words(r[2], z1); +} + +void p256_point_double(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS]) { + fiat_p256_felem x, y, z; + fiat_p256_from_words(x, a[0]); + fiat_p256_from_words(y, a[1]); + fiat_p256_from_words(z, a[2]); + fiat_p256_point_double(x, y, z, x, y, z); + fiat_p256_to_words(r[0], x); + fiat_p256_to_words(r[1], y); + fiat_p256_to_words(r[2], z); +} + +// For testing only. 
+void p256_point_add_affine(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS], + const Limb b[2][P256_LIMBS]) { + fiat_p256_felem x1, y1, z1, x2, y2; + fiat_p256_from_words(x1, a[0]); + fiat_p256_from_words(y1, a[1]); + fiat_p256_from_words(z1, a[2]); + fiat_p256_from_words(x2, b[0]); + fiat_p256_from_words(y2, b[1]); + + fiat_p256_felem z2 = {0}; + fiat_p256_cmovznz(z2, fiat_p256_nz(x2) & fiat_p256_nz(y2), z2, fiat_p256_one); + + fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 1 /* mixed */, x2, y2, z2); + + fiat_p256_to_words(r[0], x1); + fiat_p256_to_words(r[1], y1); + fiat_p256_to_words(r[2], z1); +} + +#endif diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256_shared.h b/ring-0.17.14/crypto/fipsmodule/ec/p256_shared.h new file mode 100644 index 0000000000..ef717b170b --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256_shared.h @@ -0,0 +1,62 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2014, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel +// +// Reference: +// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +// 256 Bit Primes" + +#ifndef OPENSSL_HEADER_EC_P256_SHARED_H +#define OPENSSL_HEADER_EC_P256_SHARED_H + +#include "ring-core/base.h" + +#include "../bn/internal.h" + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ + !defined(OPENSSL_SMALL) +# define OPENSSL_USE_NISTZ256 +#endif + +// P-256 field operations. +// +// An element mod P in P-256 is represented as a little-endian array of +// |P256_LIMBS| |BN_ULONG|s, spanning the full range of values. +// +// The following functions take fully-reduced inputs mod P and give +// fully-reduced outputs. They may be used in-place. + +#define P256_LIMBS (256 / BN_BITS2) + +// A P256_POINT represents a P-256 point in Jacobian coordinates. +typedef struct { + BN_ULONG X[P256_LIMBS]; + BN_ULONG Y[P256_LIMBS]; + BN_ULONG Z[P256_LIMBS]; +} P256_POINT; + +typedef unsigned char P256_SCALAR_BYTES[33]; + +static inline void p256_scalar_bytes_from_limbs( + P256_SCALAR_BYTES bytes_out, const BN_ULONG limbs[P256_LIMBS]) { + OPENSSL_memcpy(bytes_out, limbs, 32); + bytes_out[32] = 0; +} + +#endif /* !defined(OPENSSL_USE_NISTZ256) */ diff --git a/ring-0.17.14/crypto/fipsmodule/ec/p256_table.h b/ring-0.17.14/crypto/fipsmodule/ec/p256_table.h new file mode 100644 index 0000000000..e16eabaa73 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/p256_table.h @@ -0,0 +1,297 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is generated by make_tables.go. + +// Base point pre computation +// -------------------------- +// +// Two different sorts of precomputed tables are used in the following code. +// Each contain various points on the curve, where each point is three field +// elements (x, y, z). +// +// For the base point table, z is usually 1 (0 for the point at infinity). +// This table has 2 * 16 elements, starting with the following: +// index | bits | point +// ------+---------+------------------------------ +// 0 | 0 0 0 0 | 0G +// 1 | 0 0 0 1 | 1G +// 2 | 0 0 1 0 | 2^64G +// 3 | 0 0 1 1 | (2^64 + 1)G +// 4 | 0 1 0 0 | 2^128G +// 5 | 0 1 0 1 | (2^128 + 1)G +// 6 | 0 1 1 0 | (2^128 + 2^64)G +// 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G +// 8 | 1 0 0 0 | 2^192G +// 9 | 1 0 0 1 | (2^192 + 1)G +// 10 | 1 0 1 0 | (2^192 + 2^64)G +// 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G +// 12 | 1 1 0 0 | (2^192 + 2^128)G +// 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G +// 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G +// 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G +// followed by a copy of this with each element multiplied by 2^32. +// +// The reason for this is so that we can clock bits into four different +// locations when doing simple scalar multiplies against the base point, +// and then another four locations using the second 16 elements. +// +// Tables for other points have table[i] = iG for i in 0 .. 16. 
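For illustration only (not part of the vendored file): the sketch below shows how the 4-bit index described above could be assembled from scalar bits, mirroring the two per-iteration lookups in p256_point_mul_base. It assumes the fiat_p256_get_bit helper and the Limb, P256_LIMBS, and crypto_word_t names defined earlier in p256.c; base_table_index is a hypothetical name introduced here only for this example.

// Sketch: assemble the 4-bit index used at window position i (0 <= i < 32).
// Bit j of the index is scalar bit 64*j + i when selecting from the low
// table (fiat_p256_g_pre_comp[0]); the high table (fiat_p256_g_pre_comp[1])
// holds the same multiples scaled by 2^32, so its index uses scalar bits
// 64*j + i + 32 instead.
static crypto_word_t base_table_index(const Limb scalar[P256_LIMBS], int i,
                                      int high_half) {
  int shift = high_half ? 32 : 0;
  crypto_word_t bits = fiat_p256_get_bit(scalar, i + shift + 192) << 3;
  bits |= fiat_p256_get_bit(scalar, i + shift + 128) << 2;
  bits |= fiat_p256_get_bit(scalar, i + shift + 64) << 1;
  bits |= fiat_p256_get_bit(scalar, i + shift);
  return bits;  // 0 selects the point at infinity; k > 0 selects entry k - 1
}

A caller such as p256_point_mul_base would pass the result to fiat_p256_select_point_affine with size 15 and the matching half of fiat_p256_g_pre_comp.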
+ +// fiat_p256_g_pre_comp is the table of precomputed base points +#if defined(OPENSSL_64_BIT) +static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = { + {{{0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, + 0x18905f76a53755c6}, + {0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, + 0x8571ff1825885d85}}, + {{0x4f922fc516a0d2bb, 0x0d5cc16c1a623499, 0x9241cf3a57c62c8b, + 0x2f5e6961fd1b667f}, + {0x5c15c70bf5a01797, 0x3d20b44d60956192, 0x04911b37071fdb52, + 0xf648f9168d6f0f7b}}, + {{0x9e566847e137bbbc, 0xe434469e8a6a0bec, 0xb1c4276179d73463, + 0x5abe0285133d0015}, + {0x92aa837cc04c7dab, 0x573d9f4c43260c07, 0x0c93156278e6cc37, + 0x94bb725b6b6f7383}}, + {{0x62a8c244bfe20925, 0x91c19ac38fdce867, 0x5a96a5d5dd387063, + 0x61d587d421d324f6}, + {0xe87673a2a37173ea, 0x2384800853778b65, 0x10f8441e05bab43e, + 0xfa11fe124621efbe}}, + {{0x1c891f2b2cb19ffd, 0x01ba8d5bb1923c23, 0xb6d03d678ac5ca8e, + 0x586eb04c1f13bedc}, + {0x0c35c6e527e8ed09, 0x1e81a33c1819ede2, 0x278fd6c056c652fa, + 0x19d5ac0870864f11}}, + {{0x62577734d2b533d5, 0x673b8af6a1bdddc0, 0x577e7c9aa79ec293, + 0xbb6de651c3b266b1}, + {0xe7e9303ab65259b3, 0xd6a0afd3d03a7480, 0xc5ac83d19b3cfc27, + 0x60b4619a5d18b99b}}, + {{0xbd6a38e11ae5aa1c, 0xb8b7652b49e73658, 0x0b130014ee5f87ed, + 0x9d0f27b2aeebffcd}, + {0xca9246317a730a55, 0x9c955b2fddbbc83a, 0x07c1dfe0ac019a71, + 0x244a566d356ec48d}}, + {{0x56f8410ef4f8b16a, 0x97241afec47b266a, 0x0a406b8e6d9c87c1, + 0x803f3e02cd42ab1b}, + {0x7f0309a804dbec69, 0xa83b85f73bbad05f, 0xc6097273ad8e197f, + 0xc097440e5067adc1}}, + {{0x846a56f2c379ab34, 0xa8ee068b841df8d1, 0x20314459176c68ef, + 0xf1af32d5915f1f30}, + {0x99c375315d75bd50, 0x837cffbaf72f67bc, 0x0613a41848d7723f, + 0x23d0f130e2d41c8b}}, + {{0xed93e225d5be5a2b, 0x6fe799835934f3c6, 0x4314092622626ffc, + 0x50bbb4d97990216a}, + {0x378191c6e57ec63e, 0x65422c40181dcdb2, 0x41a8099b0236e0f6, + 0x2b10011801fe49c3}}, + {{0xfc68b5c59b391593, 0xc385f5a2598270fc, 0x7144f3aad19adcbb, + 0xdd55899983fbae0c}, + {0x93b88b8e74b82ff4, 0xd2e03c4071e734c9, 0x9a7a9eaf43c0322a, + 0xe6e4c551149d6041}}, + {{0x5fe14bfe80ec21fe, 0xf6ce116ac255be82, 0x98bc5a072f4a5d67, + 0xfad27148db7e63af}, + {0x90c0b6ac29ab05b3, 0x37a9a83c4e251ae6, 0x0a7dc875c2aade7d, + 0x77387de39f0e1a84}}, + {{0x1e9ecc49a56c0dd7, 0xa5cffcd846086c74, 0x8f7a1408f505aece, + 0xb37b85c0bef0c47e}, + {0x3596b6e4cc0e6a8f, 0xfd6d4bbf6b388f23, 0xaba453fac39cef4e, + 0x9c135ac8f9f628d5}}, + {{0x0a1c729495c8f8be, 0x2961c4803bf362bf, 0x9e418403df63d4ac, + 0xc109f9cb91ece900}, + {0xc2d095d058945705, 0xb9083d96ddeb85c0, 0x84692b8d7a40449b, + 0x9bc3344f2eee1ee1}}, + {{0x0d5ae35642913074, 0x55491b2748a542b1, 0x469ca665b310732a, + 0x29591d525f1a4cc1}, + {0xe76f5b6bb84f983f, 0xbe7eef419f5f84e1, 0x1200d49680baa189, + 0x6376551f18ef332c}}}, + {{{0x202886024147519a, 0xd0981eac26b372f0, 0xa9d4a7caa785ebc8, + 0xd953c50ddbdf58e9}, + {0x9d6361ccfd590f8f, 0x72e9626b44e6c917, 0x7fd9611022eb64cf, + 0x863ebb7e9eb288f3}}, + {{0x4fe7ee31b0e63d34, 0xf4600572a9e54fab, 0xc0493334d5e7b5a4, + 0x8589fb9206d54831}, + {0xaa70f5cc6583553a, 0x0879094ae25649e5, 0xcc90450710044652, + 0xebb0696d02541c4f}}, + {{0xabbaa0c03b89da99, 0xa6f2d79eb8284022, 0x27847862b81c05e8, + 0x337a4b5905e54d63}, + {0x3c67500d21f7794a, 0x207005b77d6d7f61, 0x0a5a378104cfd6e8, + 0x0d65e0d5f4c2fbd6}}, + {{0xd433e50f6d3549cf, 0x6f33696ffacd665e, 0x695bfdacce11fcb4, + 0x810ee252af7c9860}, + {0x65450fe17159bb2c, 0xf7dfbebe758b357b, 0x2b057e74d69fea72, + 0xd485717a92731745}}, + {{0xce1f69bbe83f7669, 0x09f8ae8272877d6b, 0x9548ae543244278d, + 
0x207755dee3c2c19c}, + {0x87bd61d96fef1945, 0x18813cefb12d28c3, 0x9fbcd1d672df64aa, + 0x48dc5ee57154b00d}}, + {{0xef0f469ef49a3154, 0x3e85a5956e2b2e9a, 0x45aaec1eaa924a9c, + 0xaa12dfc8a09e4719}, + {0x26f272274df69f1d, 0xe0e4c82ca2ff5e73, 0xb9d8ce73b7a9dd44, + 0x6c036e73e48ca901}}, + {{0xe1e421e1a47153f0, 0xb86c3b79920418c9, 0x93bdce87705d7672, + 0xf25ae793cab79a77}, + {0x1f3194a36d869d0c, 0x9d55c8824986c264, 0x49fb5ea3096e945e, + 0x39b8e65313db0a3e}}, + {{0xe3417bc035d0b34a, 0x440b386b8327c0a7, 0x8fb7262dac0362d1, + 0x2c41114ce0cdf943}, + {0x2ba5cef1ad95a0b1, 0xc09b37a867d54362, 0x26d6cdd201e486c9, + 0x20477abf42ff9297}}, + {{0x0f121b41bc0a67d2, 0x62d4760a444d248a, 0x0e044f1d659b4737, + 0x08fde365250bb4a8}, + {0xaceec3da848bf287, 0xc2a62182d3369d6e, 0x3582dfdc92449482, + 0x2f7e2fd2565d6cd7}}, + {{0x0a0122b5178a876b, 0x51ff96ff085104b4, 0x050b31ab14f29f76, + 0x84abb28b5f87d4e6}, + {0xd5ed439f8270790a, 0x2d6cb59d85e3f46b, 0x75f55c1b6c1e2212, + 0xe5436f6717655640}}, + {{0xc2965ecc9aeb596d, 0x01ea03e7023c92b4, 0x4704b4b62e013961, + 0x0ca8fd3f905ea367}, + {0x92523a42551b2b61, 0x1eb7a89c390fcd06, 0xe7f1d2be0392a63e, + 0x96dca2644ddb0c33}}, + {{0x231c210e15339848, 0xe87a28e870778c8d, 0x9d1de6616956e170, + 0x4ac3c9382bb09c0b}, + {0x19be05516998987d, 0x8b2376c4ae09f4d6, 0x1de0b7651a3f933d, + 0x380d94c7e39705f4}}, + {{0x3685954b8c31c31d, 0x68533d005bf21a0c, 0x0bd7626e75c79ec9, + 0xca17754742c69d54}, + {0xcc6edafff6d2dbb2, 0xfd0d8cbd174a9d18, 0x875e8793aa4578e8, + 0xa976a7139cab2ce6}}, + {{0xce37ab11b43ea1db, 0x0a7ff1a95259d292, 0x851b02218f84f186, + 0xa7222beadefaad13}, + {0xa2ac78ec2b0a9144, 0x5a024051f2fa59c5, 0x91d1eca56147ce38, + 0xbe94d523bc2ac690}}, + {{0x2d8daefd79ec1a0f, 0x3bbcd6fdceb39c97, 0xf5575ffc58f61a95, + 0xdbd986c4adf7b420}, + {0x81aa881415f39eb7, 0x6ee2fcf5b98d976c, 0x5465475dcf2f717d, + 0x8e24d3c46860bbd0}}}}; +#else +static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = { + {{{0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b, + 0xa53755c6, 0x18905f76}, + {0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688, + 0x25885d85, 0x8571ff18}}, + {{0x16a0d2bb, 0x4f922fc5, 0x1a623499, 0x0d5cc16c, 0x57c62c8b, 0x9241cf3a, + 0xfd1b667f, 0x2f5e6961}, + {0xf5a01797, 0x5c15c70b, 0x60956192, 0x3d20b44d, 0x071fdb52, 0x04911b37, + 0x8d6f0f7b, 0xf648f916}}, + {{0xe137bbbc, 0x9e566847, 0x8a6a0bec, 0xe434469e, 0x79d73463, 0xb1c42761, + 0x133d0015, 0x5abe0285}, + {0xc04c7dab, 0x92aa837c, 0x43260c07, 0x573d9f4c, 0x78e6cc37, 0x0c931562, + 0x6b6f7383, 0x94bb725b}}, + {{0xbfe20925, 0x62a8c244, 0x8fdce867, 0x91c19ac3, 0xdd387063, 0x5a96a5d5, + 0x21d324f6, 0x61d587d4}, + {0xa37173ea, 0xe87673a2, 0x53778b65, 0x23848008, 0x05bab43e, 0x10f8441e, + 0x4621efbe, 0xfa11fe12}}, + {{0x2cb19ffd, 0x1c891f2b, 0xb1923c23, 0x01ba8d5b, 0x8ac5ca8e, 0xb6d03d67, + 0x1f13bedc, 0x586eb04c}, + {0x27e8ed09, 0x0c35c6e5, 0x1819ede2, 0x1e81a33c, 0x56c652fa, 0x278fd6c0, + 0x70864f11, 0x19d5ac08}}, + {{0xd2b533d5, 0x62577734, 0xa1bdddc0, 0x673b8af6, 0xa79ec293, 0x577e7c9a, + 0xc3b266b1, 0xbb6de651}, + {0xb65259b3, 0xe7e9303a, 0xd03a7480, 0xd6a0afd3, 0x9b3cfc27, 0xc5ac83d1, + 0x5d18b99b, 0x60b4619a}}, + {{0x1ae5aa1c, 0xbd6a38e1, 0x49e73658, 0xb8b7652b, 0xee5f87ed, 0x0b130014, + 0xaeebffcd, 0x9d0f27b2}, + {0x7a730a55, 0xca924631, 0xddbbc83a, 0x9c955b2f, 0xac019a71, 0x07c1dfe0, + 0x356ec48d, 0x244a566d}}, + {{0xf4f8b16a, 0x56f8410e, 0xc47b266a, 0x97241afe, 0x6d9c87c1, 0x0a406b8e, + 0xcd42ab1b, 0x803f3e02}, + {0x04dbec69, 0x7f0309a8, 0x3bbad05f, 0xa83b85f7, 0xad8e197f, 0xc6097273, + 
0x5067adc1, 0xc097440e}}, + {{0xc379ab34, 0x846a56f2, 0x841df8d1, 0xa8ee068b, 0x176c68ef, 0x20314459, + 0x915f1f30, 0xf1af32d5}, + {0x5d75bd50, 0x99c37531, 0xf72f67bc, 0x837cffba, 0x48d7723f, 0x0613a418, + 0xe2d41c8b, 0x23d0f130}}, + {{0xd5be5a2b, 0xed93e225, 0x5934f3c6, 0x6fe79983, 0x22626ffc, 0x43140926, + 0x7990216a, 0x50bbb4d9}, + {0xe57ec63e, 0x378191c6, 0x181dcdb2, 0x65422c40, 0x0236e0f6, 0x41a8099b, + 0x01fe49c3, 0x2b100118}}, + {{0x9b391593, 0xfc68b5c5, 0x598270fc, 0xc385f5a2, 0xd19adcbb, 0x7144f3aa, + 0x83fbae0c, 0xdd558999}, + {0x74b82ff4, 0x93b88b8e, 0x71e734c9, 0xd2e03c40, 0x43c0322a, 0x9a7a9eaf, + 0x149d6041, 0xe6e4c551}}, + {{0x80ec21fe, 0x5fe14bfe, 0xc255be82, 0xf6ce116a, 0x2f4a5d67, 0x98bc5a07, + 0xdb7e63af, 0xfad27148}, + {0x29ab05b3, 0x90c0b6ac, 0x4e251ae6, 0x37a9a83c, 0xc2aade7d, 0x0a7dc875, + 0x9f0e1a84, 0x77387de3}}, + {{0xa56c0dd7, 0x1e9ecc49, 0x46086c74, 0xa5cffcd8, 0xf505aece, 0x8f7a1408, + 0xbef0c47e, 0xb37b85c0}, + {0xcc0e6a8f, 0x3596b6e4, 0x6b388f23, 0xfd6d4bbf, 0xc39cef4e, 0xaba453fa, + 0xf9f628d5, 0x9c135ac8}}, + {{0x95c8f8be, 0x0a1c7294, 0x3bf362bf, 0x2961c480, 0xdf63d4ac, 0x9e418403, + 0x91ece900, 0xc109f9cb}, + {0x58945705, 0xc2d095d0, 0xddeb85c0, 0xb9083d96, 0x7a40449b, 0x84692b8d, + 0x2eee1ee1, 0x9bc3344f}}, + {{0x42913074, 0x0d5ae356, 0x48a542b1, 0x55491b27, 0xb310732a, 0x469ca665, + 0x5f1a4cc1, 0x29591d52}, + {0xb84f983f, 0xe76f5b6b, 0x9f5f84e1, 0xbe7eef41, 0x80baa189, 0x1200d496, + 0x18ef332c, 0x6376551f}}}, + {{{0x4147519a, 0x20288602, 0x26b372f0, 0xd0981eac, 0xa785ebc8, 0xa9d4a7ca, + 0xdbdf58e9, 0xd953c50d}, + {0xfd590f8f, 0x9d6361cc, 0x44e6c917, 0x72e9626b, 0x22eb64cf, 0x7fd96110, + 0x9eb288f3, 0x863ebb7e}}, + {{0xb0e63d34, 0x4fe7ee31, 0xa9e54fab, 0xf4600572, 0xd5e7b5a4, 0xc0493334, + 0x06d54831, 0x8589fb92}, + {0x6583553a, 0xaa70f5cc, 0xe25649e5, 0x0879094a, 0x10044652, 0xcc904507, + 0x02541c4f, 0xebb0696d}}, + {{0x3b89da99, 0xabbaa0c0, 0xb8284022, 0xa6f2d79e, 0xb81c05e8, 0x27847862, + 0x05e54d63, 0x337a4b59}, + {0x21f7794a, 0x3c67500d, 0x7d6d7f61, 0x207005b7, 0x04cfd6e8, 0x0a5a3781, + 0xf4c2fbd6, 0x0d65e0d5}}, + {{0x6d3549cf, 0xd433e50f, 0xfacd665e, 0x6f33696f, 0xce11fcb4, 0x695bfdac, + 0xaf7c9860, 0x810ee252}, + {0x7159bb2c, 0x65450fe1, 0x758b357b, 0xf7dfbebe, 0xd69fea72, 0x2b057e74, + 0x92731745, 0xd485717a}}, + {{0xe83f7669, 0xce1f69bb, 0x72877d6b, 0x09f8ae82, 0x3244278d, 0x9548ae54, + 0xe3c2c19c, 0x207755de}, + {0x6fef1945, 0x87bd61d9, 0xb12d28c3, 0x18813cef, 0x72df64aa, 0x9fbcd1d6, + 0x7154b00d, 0x48dc5ee5}}, + {{0xf49a3154, 0xef0f469e, 0x6e2b2e9a, 0x3e85a595, 0xaa924a9c, 0x45aaec1e, + 0xa09e4719, 0xaa12dfc8}, + {0x4df69f1d, 0x26f27227, 0xa2ff5e73, 0xe0e4c82c, 0xb7a9dd44, 0xb9d8ce73, + 0xe48ca901, 0x6c036e73}}, + {{0xa47153f0, 0xe1e421e1, 0x920418c9, 0xb86c3b79, 0x705d7672, 0x93bdce87, + 0xcab79a77, 0xf25ae793}, + {0x6d869d0c, 0x1f3194a3, 0x4986c264, 0x9d55c882, 0x096e945e, 0x49fb5ea3, + 0x13db0a3e, 0x39b8e653}}, + {{0x35d0b34a, 0xe3417bc0, 0x8327c0a7, 0x440b386b, 0xac0362d1, 0x8fb7262d, + 0xe0cdf943, 0x2c41114c}, + {0xad95a0b1, 0x2ba5cef1, 0x67d54362, 0xc09b37a8, 0x01e486c9, 0x26d6cdd2, + 0x42ff9297, 0x20477abf}}, + {{0xbc0a67d2, 0x0f121b41, 0x444d248a, 0x62d4760a, 0x659b4737, 0x0e044f1d, + 0x250bb4a8, 0x08fde365}, + {0x848bf287, 0xaceec3da, 0xd3369d6e, 0xc2a62182, 0x92449482, 0x3582dfdc, + 0x565d6cd7, 0x2f7e2fd2}}, + {{0x178a876b, 0x0a0122b5, 0x085104b4, 0x51ff96ff, 0x14f29f76, 0x050b31ab, + 0x5f87d4e6, 0x84abb28b}, + {0x8270790a, 0xd5ed439f, 0x85e3f46b, 0x2d6cb59d, 0x6c1e2212, 0x75f55c1b, + 0x17655640, 0xe5436f67}}, + {{0x9aeb596d, 
0xc2965ecc, 0x023c92b4, 0x01ea03e7, 0x2e013961, 0x4704b4b6, + 0x905ea367, 0x0ca8fd3f}, + {0x551b2b61, 0x92523a42, 0x390fcd06, 0x1eb7a89c, 0x0392a63e, 0xe7f1d2be, + 0x4ddb0c33, 0x96dca264}}, + {{0x15339848, 0x231c210e, 0x70778c8d, 0xe87a28e8, 0x6956e170, 0x9d1de661, + 0x2bb09c0b, 0x4ac3c938}, + {0x6998987d, 0x19be0551, 0xae09f4d6, 0x8b2376c4, 0x1a3f933d, 0x1de0b765, + 0xe39705f4, 0x380d94c7}}, + {{0x8c31c31d, 0x3685954b, 0x5bf21a0c, 0x68533d00, 0x75c79ec9, 0x0bd7626e, + 0x42c69d54, 0xca177547}, + {0xf6d2dbb2, 0xcc6edaff, 0x174a9d18, 0xfd0d8cbd, 0xaa4578e8, 0x875e8793, + 0x9cab2ce6, 0xa976a713}}, + {{0xb43ea1db, 0xce37ab11, 0x5259d292, 0x0a7ff1a9, 0x8f84f186, 0x851b0221, + 0xdefaad13, 0xa7222bea}, + {0x2b0a9144, 0xa2ac78ec, 0xf2fa59c5, 0x5a024051, 0x6147ce38, 0x91d1eca5, + 0xbc2ac690, 0xbe94d523}}, + {{0x79ec1a0f, 0x2d8daefd, 0xceb39c97, 0x3bbcd6fd, 0x58f61a95, 0xf5575ffc, + 0xadf7b420, 0xdbd986c4}, + {0x15f39eb7, 0x81aa8814, 0xb98d976c, 0x6ee2fcf5, 0xcf2f717d, 0x5465475d, + 0x6860bbd0, 0x8e24d3c4}}}}; +#endif diff --git a/ring-0.17.14/crypto/fipsmodule/ec/util.h b/ring-0.17.14/crypto/fipsmodule/ec/util.h new file mode 100644 index 0000000000..1fef3da614 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ec/util.h @@ -0,0 +1,258 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../../internal.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + + +// This function looks at 5+1 scalar bits (5 current, 1 adjacent less +// significant bit), and recodes them into a signed digit for use in fast point +// multiplication: the use of signed rather than unsigned digits means that +// fewer points need to be precomputed, given that point inversion is easy (a +// precomputed point dP makes -dP available as well). +// +// BACKGROUND: +// +// Signed digits for multiplication were introduced by Booth ("A signed binary +// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, +// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. +// Booth's original encoding did not generally improve the density of nonzero +// digits over the binary representation, and was merely meant to simplify the +// handling of signed factors given in two's complement; but it has since been +// shown to be the basis of various signed-digit representations that do have +// further advantages, including the wNAF, using the following general +// approach: +// +// (1) Given a binary representation +// +// b_k ... b_2 b_1 b_0, +// +// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 +// by using bit-wise subtraction as follows: +// +// b_k b_(k-1) ... b_2 b_1 b_0 +// - b_k ... b_3 b_2 b_1 b_0 +// ----------------------------------------- +// s_(k+1) s_k ... 
s_3 s_2 s_1 s_0 +// +// A left-shift followed by subtraction of the original value yields a new +// representation of the same value, using signed bits s_i = b_(i-1) - b_i. +// This representation from Booth's paper has since appeared in the +// literature under a variety of different names including "reversed binary +// form", "alternating greedy expansion", "mutual opposite form", and +// "sign-alternating {+-1}-representation". +// +// An interesting property is that among the nonzero bits, values 1 and -1 +// strictly alternate. +// +// (2) Various window schemes can be applied to the Booth representation of +// integers: for example, right-to-left sliding windows yield the wNAF +// (a signed-digit encoding independently discovered by various researchers +// in the 1990s), and left-to-right sliding windows yield a left-to-right +// equivalent of the wNAF (independently discovered by various researchers +// around 2004). +// +// To prevent leaking information through side channels in point multiplication, +// we need to recode the given integer into a regular pattern: sliding windows +// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few +// decades older: we'll be using the so-called "modified Booth encoding" due to +// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 +// (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five +// signed bits into a signed digit: +// +// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) +// +// The sign-alternating property implies that the resulting digit values are +// integers from -16 to 16. +// +// Of course, we don't actually need to compute the signed digits s_i as an +// intermediate step (that's just a nice way to see how this scheme relates +// to the wNAF): a direct computation obtains the recoded digit from the +// six bits b_(5j + 4) ... b_(5j - 1). +// +// This function takes those six bits as an integer (0 .. 63), writing the +// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute +// value, in the range 0 .. 16). Note that this integer essentially provides +// the input bits "shifted to the left" by one position: for example, the input +// to compute the least significant recoded digit, given that there's no bit +// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. +// +// DOUBLING CASE: +// +// Point addition formulas for short Weierstrass curves are often incomplete. +// Edge cases such as P + P or P + ∞ must be handled separately. This +// complicates constant-time requirements. P + ∞ cannot be avoided (any window +// may be zero) and is handled with constant-time selects. P + P (where P is not +// ∞) usually is not. Instead, windowing strategies are chosen to avoid this +// case. Whether this happens depends on the group order. +// +// Let w be the window width (in this function, w = 5). The non-trivial doubling +// case in single-point scalar multiplication may occur if and only if the +// 2^(w-1) bit of the group order is zero. +// +// Note the above only holds if the scalar is fully reduced and the group order +// is a prime that is much larger than 2^w. It also only holds when windows +// are applied from most significant to least significant, doubling between each +// window. It does not apply to more complex table strategies such as +// |EC_nistz256_method|. +// +// PROOF: +// +// Let n be the group order. Let l be the number of bits needed to represent n. 
+// Assume there exists some 0 <= k < n such that signed w-bit windowed +// multiplication hits the doubling case. +// +// Windowed multiplication consists of iterating over groups of s_i (defined +// above based on k's binary representation) from most to least significant. At +// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant +// window), we: +// +// 1. Double the accumulator A, w times. Let A_i be the value of A at this +// point. +// +// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P +// corresponding to the window s_(i+w-1) ... s_i. +// +// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as +// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i. +// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is +// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i = +// 2^w * (a_(i+w) + t_(i+w)). +// +// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it +// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i. +// This is computed as: +// +// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1) +// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i +// -------------------------------------------- +// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i +// +// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer +// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i. +// +// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x) +// = x - 2^(w-1)*b_(i+w-1) + b_(i-1) +// +// Or, using C notation for bit operations: +// +// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1 +// +// Note b_(i-1) is added in left-shifted by one (or doubled) from its place. +// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed +// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C +// notation, this is: +// +// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w +// +// Observe that, while t_i may be positive or negative, a_i is bounded by +// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up +// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for +// all groups. That would imply the subsequent a_i is zero, which means all +// terms thus far were zero.) +// +// Returning to our doubling position, we have a_j = t_j (mod n). We now +// determine the value of a_j - t_j, which must be divisible by n. Our bounds on +// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w +// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if +// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n. +// +// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then, +// +// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j +// <= k/2^j + 2^w - t_j +// < n/2^w + 2^w + 2^(w-1) +// +// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final +// addition may hit the doubling case. +// +// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L +// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the +// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0. +// That is: +// +// - 2^w divides k_H +// - k_M is 0 or 2^(w-1) +// - 0 <= k_L < 2^(w-1) +// +// Divide n into n_H + n_M + n_L similarly. 
We thus have: +// +// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1 +// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1)) +// = k_L - k_M +// +// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w +// = (k>>w) << w + ((k>>(w-1)) & 1) << w +// = k_H + 2*k_M +// +// n = a_0 - t_0 +// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M) +// = k_H + 3*k_M - k_L +// +// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be +// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H. +// Then, +// +// n_M + n_L = 3*(2^(w-1)) - k_L +// > 3*(2^(w-1)) - 2^(w-1) +// = 2^w +// +// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose +// k_H < n_H - 2*2^w. Then, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// < n_H - 2*2^w + 3*(2^(w-1)) - k_L +// n_M + n_L < -2^(w-1) - k_L +// +// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// = n_H - 2^w + 3*(2^(w-1)) - k_L +// n_M + n_L = 2^(w-1) - k_L +// <= 2^(w-1) +// +// Equality would mean 2^(w-1) divides n, which is impossible if n is prime. +// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition. +// +// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w, +// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point +// doubling in the final addition and is the only such scalar. +// +// COMMON CURVES: +// +// The group orders for common curves end in the following bit patterns: +// +// P-521: ...00001001; w = 4 is okay +// P-384: ...01110011; w = 2, 5, 6, 7 are okay +// P-256: ...01010001; w = 5, 7 are okay +// P-224: ...00111101; w = 3, 4, 5, 6 are okay +static inline void recode_scalar_bits(crypto_word_t *sign, crypto_word_t *digit, + crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as + * 6-bit value */ + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + *sign = s & 1; + *digit = d; +} diff --git a/ring-0.17.14/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt b/ring-0.17.14/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt new file mode 100644 index 0000000000..c00af6c9e9 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt @@ -0,0 +1,1362 @@ +# Tests from NIST CAVP 186-4 ECDSA2VS Test Vectors, Signature Verification Test +# http://csrc.nist.gov/groups/STM/cavp/documents/dss/186-3ecdsatestvectors.zip +# +# NIST's files provide message and digest pairs. Since this is a low-level test, +# the digests have been extracted. P-521 test vectors were fixed to have the +# right number of leading zeros. 
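+# Field key for the vectors below: X and Y are the affine coordinates of the
+# public key, Digest is the already-hashed message, R and S are the signature
+# components, and "Invalid = Y" marks vectors that are expected to fail
+# verification (vectors without it are expected to verify).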
+ +Curve = P-256 +X = 1198b3c409a8b47edb1347e0982d533cb1813e5cb2a92c824b2881b3cd2f3f4a +Y = 0bdbac5fa02e41e775f8d602446d58ecb2209b5a3d79ae69eef399016e992e87 +Digest = 01ed0c41d650479c47057f61433d7e8b24492649 +R = 9206d435f148f88c15b2effbf3c506e41b2c620102022b801e371d0767b54bea +S = cbc4e1674ae1af69873946ccf6275946e59e0107278749b2d0010795833d80fa +Invalid = Y + +Curve = P-256 +X = f7c6280aecd6b936513b0ca84e63346333dc41437a15442e605d46bba93ae101 +Y = 3c834cecc16167b07866a9478f9f2d882de7ef937da447cd837e60cb5ed65d81 +Digest = f91b4dfddd5eb33a875d2e50d1e949211ac819da +R = f615af212ab030c4bbf9362d9815a1462312df4beb4358a7ce80d820355420bf +S = d12ed715ef65cfe6fe6bf348364088a0e7f70927bbafe4c12fc4cb65c0cc51bc +Invalid = Y + +Curve = P-256 +X = 0e7632dbc4db879e10d1d80f2789d9fa414c1fe77a6c1e56d6667af43e36e610 +Y = 6f0dd2a5840e5a6f6ff7e23f656f5c945b7a493fbb0cfd5b9b531bf04435b1ef +Digest = 3905696f8bad8205fa1445df0e91ade3dbc413e6 +R = 2b0b9ab4a575732a168f28494b66a855fc1a757fb1177864bf3e4f0a000c4a86 +S = 54901ce2f92f55ac112afa0f8b62bc00b44c8c10fe0c863675bfd305d6dc0cd8 +Invalid = Y + +Curve = P-256 +X = 1613f12bae8e98d09b4bba53f5229596a0d417d2c625f41bb15f923b3c1e4b57 +Y = 411319fa85227997a4cf3b1756161485124d2cedc38c9c30d82f42dc2647d545 +Digest = 580d31ce22700a20c2db81bcdac37330b491c86f +R = ed058d476a77be99c1b0fc8502abe545541b4c0ff3eed3f558133ae2f02042b0 +S = c571b4895712a4f64f7220b0694cab767379b09f1824fe7874acd127deb2371e +Invalid = Y + +Curve = P-256 +X = 88bb041dcb1733a676a7f4ae8d3e407d72d5396547f07db77078485c1d5db077 +Y = 72cf2b55e596cd140c58228f1b0a19c34fca26ffac043528a417c5abb6fca9c9 +Digest = 7900a02f768b0718a13525c33adace583de15c50 +R = 87208734deb125dca68f0d33f9d369cf1b79cf5a021391b9c6c1727d2efe663a +S = b984f722de18f1ce407104342948f03f2b55413a096c4b5fca1e032a2c814a4a +Invalid = Y + +Curve = P-256 +X = 811eb5180def7fb60d632f8cb2cba831b88cee778aa2a82ec3a5fc3d80ff7fb6 +Y = db88d65b0fc35d9ba1f1ced0400434979ae895d371d1441d7c7a441a9fb1709b +Digest = 17b7451ea903125ccb293ffaa9d1a4ca1141a2c5 +R = c329fa28dac0018276c5af0cd770e60be50bc14e2562d5556991971edc7d4916 +S = 2d111d13837a02fa279fe835a7dc59a521864d92b26649ca4e24b36ae93878e8 +Invalid = Y + +Curve = P-256 +X = 4a6f1e7f7268174d23993b8b58aa60c2a87b18de79b36a750ec86dd6f9e12227 +Y = 572df22bd6487a863a51ca544b8c5de2b47f801372a881cb996a97d9a98aa825 +Digest = 54e9a048559f370425e9c8e54a460ec91bcc930a +R = 4a800e24de65e5c57d4cab4dd1ef7b6c38a2f0aa5cfd3a571a4b552fb1993e69 +S = d9c89fb983640a7e65edf632cacd1de0823b7efbc798fc1f7bbfacdda7398955 +Invalid = Y + +Curve = P-256 +X = f3033d1e548d245b5e45ff1147db8cd44db8a1f2823c3c164125be88f9a982c2 +Y = 3c078f6cee2f50e95e8916aa9c4e93de3fdf9b045abac6f707cfcb22d065638e +Digest = e8d38e4c6a905a814b04c2841d898ed6da023c34 +R = d4255db86a416a5a688de4e238071ef16e5f2a20e31b9490c03dee9ae6164c34 +S = 4e0ac1e1a6725bf7c6bd207439b2d370c5f2dea1ff4decf1650ab84c7769efc0 + +Curve = P-256 +X = 0ea0a6bb6c70966fad1a2307479c12de2322795bdecb70e4b286bd6200ba9c1a +Y = c40eda3947021348db691ac4086fb6c06b587ce37c155bb0a7d912b93226de81 +Digest = 3b08bf1b67abc03c1cd69b0e24743b5c2d49e506 +R = f5509deff7bfda3f3759800fa4033af6a84466b114ecb48eac37eff48d2ae1b3 +S = 8c4b62dce2082f80caf220cdbb1d02567bbdfab40564b90ef31d86e3e10ce80a +Invalid = Y + +Curve = P-256 +X = e7a57e0f6ec0fa9c7c34978034cf82f039f8fd62804070ad943573fc8efa5775 +Y = 87b2cc85dfff2dae5620fbe3e6256bd728de28fc9dc1b5eb6b5d7bd5d29186ad +Digest = a8c5dc0344b1442dfdb5f8836251893d6c4ecbe9 +R = 97642038932fdddbe2021ec1af53ae6b9af00ef9c8b9f26aea582892e80e6285 +S = 
9cb14918359338041cf795cf6781e4905837fa5ce3b3e50ffafb5f13c73b5bc8 +Invalid = Y + +Curve = P-256 +X = be7a651be0c87278569987cf62d7fa1dd1b3d6e1b868d8f4dfb56135a9960eec +Y = b7a62c588a987760b915edbd7f95506870c60f042471de1d8b2d4cd9d6563391 +Digest = 2f93ee45db133a14c26d418c2ffd3470ae63bf50 +R = aa889fb608b6939f6eeacf2f64c3b2e3a6061f2834058c7e724321720b737a63 +S = 6cd6d0ef2b93a760daa914e11b9b414bd4d72457405f00a62ab63f36d76efb73 +Invalid = Y + +Curve = P-256 +X = 76ddc46d8db8d7ce2ce837f60cdabcee92b7c7817ee41c8f066f1ae65f85c318 +Y = bea47191f1c584c87250370ce337a1de1583bcfc20ccc23b7a82e83f19adaa88 +Digest = 2136a5470ff9d45214a0b2c300042efea8ff7266 +R = 84a42efbf7ec04166ad144d19cd98c120aa2e79d483b5eea6fbdfa7f1222e07b +S = e41531205e691e65668f69f518abc7b60f32c373434872a043b7358462babf83 +Invalid = Y + +Curve = P-256 +X = 2f71b932f770ba9daf7c1dd47444ab6cb8881f71a1c597e719845b15cb84ca35 +Y = ab928625b40ec0738d0fc8dbc4df4a1f65d20bc0447b69cfa13bb20b95bb41d4 +Digest = ae6093bb37c1264ca3ead439e4f678721912c8c4 +R = 63fca172bbca6197cd2802a9cb61d74c2b47cf35f6d35203e67ffbaa838be775 +S = e70ec283cd212df6ba3723e26b697501f112d7cf64e4f45185dae76055e09f1e + +Curve = P-256 +X = ce775648b928db82ac5edb3b009d32959a73b86c45e96d4b8d5b6e640b7c2790 +Y = 52455caf08ee94d86f0984e9ec9268d74823f2102dd97fced59638055f6af18e +Digest = 60054807acb29e3091a023c42b9885c4945249e1 +R = 2a64b29146588f3153fee1029a0131ac0a8a25ba2ecc494f697c166c7c91fc08 +S = 7b429bc12a72ca3d76c119eea9f4098633cc31c87831e54d5d93afd6e8d20f4f +Invalid = Y + +Curve = P-256 +X = cd2f29a53f0ce57e0e4a542c3256e65ebbdc30415f4de771d5d706d3aeacc852 +Y = dbbf2c129f30d11fe77d7816a24187764eae3fb2ff70c1ec745e876e26f5232f +Digest = 5f50e35b134942295c16d003742fd6bce5bdab45 +R = 2454c5ee84e4f77b554acd368dd412389db8c78429590a092f24db2da43cb761 +S = 63e870ce2fa4085d4ff1e360f7a5c101a1f8b288abe71cca56887e613ad034b7 + +Curve = P-256 +X = 843f6d83d777aac75b758d58c670f417c8deea8d339a440bb626114318c34f29 +Y = 83e0c70008521c8509044b724420463e3478e3c91874d424be44413d1ce555f3 +Digest = cda2c7ad9abb2a858c4981550f78974c69e41cc31fa33509e3e83dc2 +R = d08e9a5db411019d826b20ac889227ed245503a6d839494db1e8d7995a6b245b +S = 8d46a204054125d0dc776ab1055302ec4eb0f20b90bca6d205f21d3cefd29097 + +Curve = P-256 +X = f08b56f73f7a0e098444f6f0a02ad81ce0b914a11cafa15893d1c84704e1c564 +Y = bbee9aeb91cdc2d1d1437b4168df73acfd64e8b02962b14c85e67187e1ef80a4 +Digest = 5453c2656550e9b3dc6c40a3f1362a73522396bc35d383dd6451128f +R = 71b3ec982725a007ac18a5cf60587e1fd1beb57685a1f9df3cddd9df25dcbc18 +S = 407e41217325f92f8a031cfcc4eb64c1a4b17b0a7459c254af754a7ea9eac997 +Invalid = Y + +Curve = P-256 +X = 0b688e761e1ddda2305e002809da65bf5916dfe1356a5b99b61f5576a9b90efa +Y = 90ec958e2e3a676e7bbf8e9394f72742875836125a317b0ae38374953f746a91 +Digest = 7289573d6bb7486e428e086bec9da9d7ff3c5f8bd0db2ec209fed6ae +R = ef89df3bbf079fb250f7e882c4f85c0023fc3804e862d9ef4d9530a15f1013f0 +S = 4ba985e900e6737b8e07eac638f7b38277ead4faee6d2076a2eee90fd2a6bf0f +Invalid = Y + +Curve = P-256 +X = 0b64480783e260e1e9caef37b4cc9c650d2d57e2c594b1106314843d8d7ab74e +Y = 29d373d8522deffe40055aef539f53f38937eb799b44f05a8d8c0b381f12907f +Digest = 497656e780360ec3b4bd1be97570615e4a32467982cd9330bc6aa224 +R = c5c26b0b21eef0f7a0f1cff38d0079d890376759369b01d8d8e959c1c785e203 +S = fecc400bf0deab99d87da168b9d0dd31d2dfa3435b0fe9d38b5fb8efd45195a4 +Invalid = Y + +Curve = P-256 +X = 7f78a8fd880c509940e2b83de67c9ab553ab91489bae75cdc1d5b523b06ab7f5 +Y = 7786aee7032c373cdfad7d9ddb6fa09a026f6da30fd477ab014d30a289d542a1 +Digest = 
6d88da9e83ae9457e233d7977172c062dfbdd17d365694515251e031 +R = c93ada69db326f76b1362d610cb8bcc6e7ef1dc03d3d11367e153c0e39d5dc86 +S = d0c02c71b14ef7a4af4e23bd207ce98449f5d6e7e5b3ec8cbbca9549e97d379d + +Curve = P-256 +X = e58cdc207c56f62e0bb7c0b55b7f7236a6b308f8fc4de3e61cdb3bf20ad2f62c +Y = 6056c0ee827e85ba284838954d0c6cc096df03b4611b1e0f7f9002bac86856d4 +Digest = 3f9a97b8ea807edc88788df8956c296b1daaed8dd12d50c712344091 +R = 2df3906527ad322000285bccdd11dd09130d633cf43534f5802604639eb847e0 +S = adaaad19b7c66836ef0f4afeff8ac5e898cd2523246a74a1a291a3a1ff583322 + +Curve = P-256 +X = 70b4bba10b7bbc6d4175ada8d485f3685b13916d0c992301f47e45b629c63d0e +Y = 257a93be31b09ff4cd22e3375e30b5a79f3bf3c74c80dde93e5d65e88c07c1c4 +Digest = cc3a0d3a5d4f28dc9144a3cdb276eb92265f1157a8d8192cf628673c +R = 6e714a737b07a4784d26bde0399d8eee81998a13363785e2e4fb527e6a5c9e4e +S = 94c0220f0f3fa66ff24f96717f464b66ae3a7b0f228ab6a0b5775038da13768a +Invalid = Y + +Curve = P-256 +X = 8b11b48d2397355000a5289d816b9892ae64dffc842abec02a2fb2db2bb34310 +Y = fc1a42528a0473cfc2c2e184b8bc5055096350fe1549d24b526d6536681026e8 +Digest = f340e491fa935be8945b8caa485d0699c66331e0e17c7407da1b018e +R = 61a91dd1c80049e70dc4aea84bda0efc6ec9c7b9dd16ecbccf687244c51184ce +S = e381e7b32bab49578c7e7ce7784ce19263e4a7dab4b614df411d20eaebfc391c +Invalid = Y + +Curve = P-256 +X = 7bad1b3d8bad4355a44511d2eb50daeae793af99418ada118327359936aa0e1d +Y = e7eff40334b7a5455f6b0d0ecdcdc513702857bb5bbb73c910c86746092bcd7d +Digest = 9cf84546c046b370c372c167ebba39af6aadd60463626453787bb058 +R = fd961b60b21be32b47abafa77e22197dc99af6825dcca46e0e3b1991a90aa202 +S = a0477f97b94a1c26a3b2d186791d7fc9dfa8130bbae79c28fa11ec93a3aeac0b +Invalid = Y + +Curve = P-256 +X = 407d92c9b28723602bf09f20f0de002afdf90e22cb709a8d38e3c51e82cba96c +Y = 4530659432e1dd74237768133e1f9808e62d0fbe5d1d979d1571baf645dcb84c +Digest = 0cf5cd48c93f45472d254196bebea4bddb272a2adff23bab8c3adf99 +R = a7dc65293ee3deb0008ae3e2d7ef9e9a4ebb8bf7b10d165f80ab8bed58d6fdef +S = 3e8300a3ee603a8d8234fe265c628e705015bf1903eb74c943323050626f701f +Invalid = Y + +Curve = P-256 +X = 26aea3dd5c53f984dbdaf415c7f26e1e73048658a548eb3b59dd5f721899919a +Y = dff15f57bd9b08644d49cbb214403647195725cd4d4511bc8a48b0770466ae9f +Digest = 75d6b6b575d0a2c89528b83c94ef864c825b66253ab662b36bb0e716 +R = 726af92afe53e8125b0b9f3659745be401a37ae658b7b1aa88c3cb97e9de22c3 +S = 794484c5837a419efe11a4e4293341a6fa36d21230925a0e5e135887302acca9 +Invalid = Y + +Curve = P-256 +X = e73418677ce044b331a6d60773cbae199221699d31e1bec4b68b9bc0b87e4cd0 +Y = 37215db4e3d9161f3351b385a61ddb2fcf1cec469d1659e7574610ed27fe879f +Digest = dcbb92e3be3951d37e37852d508f78da29c8183c5dbe59d6549f78ed +R = ac469290a8f61a2a8c6adc7533dd5cfe804e2e7bf101cc74e5f624f301bccd23 +S = 4c328c3bc259316641fff44753743afebe89b8627f904df7245e42adcff2dc76 +Invalid = Y + +Curve = P-256 +X = b0892b19c508b3543a5ae864ba9194084c8f7ae544760759550cc160972e87ff +Y = 9208e9b0c86ad6bc833e53026f233db9a42298cdb35d906326008377520b7d98 +Digest = 90333facb4f5068c1d05d1a478fb46d02f367e271a000474c06a5fec +R = a62dd0d1518c6b9c60de766b952312a8d8c6eaa36a68196d2a30a46fb17dc067 +S = b9ded660e978129277f74c1d436003d1e6d556dc8eed9d505bbaf4c67cb13d21 +Invalid = Y + +Curve = P-256 +X = 8c5c41cb07d828a6a86be4533aef791d3a70a95cb285aa2956b21feeac2f8c49 +Y = 84101581cad7a48b7d0596df7ffed47085d22e8a4af685cddbeeb32ea69ae190 +Digest = 8bb52bd045c985167f673c07b613a3402f435a54c122877bc0c5fe34 +R = 9812449df0a51f7a2a8f78aa9a589ca9644dce285f1e69658daaea759fa5bd7e +S = 
beb4c27c748a7944e37afe861576f76b5a749a8ccbbd7dec00838ba250ddfe1a +Invalid = Y + +Curve = P-256 +X = 788d7e54ab03020e4954f41259052ee5af68361492b180da31fbbe68d868aa95 +Y = 982a3ababa6d351649e56da3faeb7160b9de74e22fe93a06ead1bd9a8dffdf7e +Digest = 9870ae25b0f0403eff1079b94669cf95fb250fb098eeb885ff08f117 +R = 3ddea06bf8aa4a1b0c68674a2c4796def0bfb52236f4efb3332204a41fd8ea89 +S = 871237039431a41aeefcdd08f67848b2b09067e3a1344c8ed9b372d1b1c754a6 +Invalid = Y + +Curve = P-256 +X = 87f8f2b218f49845f6f10eec3877136269f5c1a54736dbdf69f89940cad41555 +Y = e15f369036f49842fac7a86c8a2b0557609776814448b8f5e84aa9f4395205e9 +Digest = a82c31412f537135d1c418bd7136fb5fde9426e70c70e7c2fb11f02f30fdeae2 +R = d19ff48b324915576416097d2544f7cbdf8768b1454ad20e0baac50e211f23b0 +S = a3e81e59311cdfff2d4784949f7a2cb50ba6c3a91fa54710568e61aca3e847c6 +Invalid = Y + +Curve = P-256 +X = 5cf02a00d205bdfee2016f7421807fc38ae69e6b7ccd064ee689fc1a94a9f7d2 +Y = ec530ce3cc5c9d1af463f264d685afe2b4db4b5828d7e61b748930f3ce622a85 +Digest = 5984eab8854d0a9aa5f0c70f96deeb510e5f9ff8c51befcdc3c41bac53577f22 +R = dc23d130c6117fb5751201455e99f36f59aba1a6a21cf2d0e7481a97451d6693 +S = d6ce7708c18dbf35d4f8aa7240922dc6823f2e7058cbc1484fcad1599db5018c +Invalid = Y + +Curve = P-256 +X = 2ddfd145767883ffbb0ac003ab4a44346d08fa2570b3120dcce94562422244cb +Y = 5f70c7d11ac2b7a435ccfbbae02c3df1ea6b532cc0e9db74f93fffca7c6f9a64 +Digest = 44b02ad3088076f997220a68ff0b27a58ecfa528b604427097cce5ca956274c5 +R = 9913111cff6f20c5bf453a99cd2c2019a4e749a49724a08774d14e4c113edda8 +S = 9467cd4cd21ecb56b0cab0a9a453b43386845459127a952421f5c6382866c5cc +Invalid = Y + +Curve = P-256 +X = e424dc61d4bb3cb7ef4344a7f8957a0c5134e16f7a67c074f82e6e12f49abf3c +Y = 970eed7aa2bc48651545949de1dddaf0127e5965ac85d1243d6f60e7dfaee927 +Digest = d1b8ef21eb4182ee270638061063a3f3c16c114e33937f69fb232cc833965a94 +R = bf96b99aa49c705c910be33142017c642ff540c76349b9dab72f981fd9347f4f +S = 17c55095819089c2e03b9cd415abdf12444e323075d98f31920b9e0f57ec871c + +Curve = P-256 +X = e0fc6a6f50e1c57475673ee54e3a57f9a49f3328e743bf52f335e3eeaa3d2864 +Y = 7f59d689c91e463607d9194d99faf316e25432870816dde63f5d4b373f12f22a +Digest = b9336a8d1f3e8ede001d19f41320bc7672d772a3d2cb0e435fff3c27d6804a2c +R = 1d75830cd36f4c9aa181b2c4221e87f176b7f05b7c87824e82e396c88315c407 +S = cb2acb01dac96efc53a32d4a0d85d0c2e48955214783ecf50a4f0414a319c05a + +Curve = P-256 +X = a849bef575cac3c6920fbce675c3b787136209f855de19ffe2e8d29b31a5ad86 +Y = bf5fe4f7858f9b805bd8dcc05ad5e7fb889de2f822f3d8b41694e6c55c16b471 +Digest = 640c13e290147a48c83e0ea75a0f92723cda125ee21a747e34c8d1b36f16cf2d +R = 25acc3aa9d9e84c7abf08f73fa4195acc506491d6fc37cb9074528a7db87b9d6 +S = 9b21d5b5259ed3f2ef07dfec6cc90d3a37855d1ce122a85ba6a333f307d31537 +Invalid = Y + +Curve = P-256 +X = 3dfb6f40f2471b29b77fdccba72d37c21bba019efa40c1c8f91ec405d7dcc5df +Y = f22f953f1e395a52ead7f3ae3fc47451b438117b1e04d613bc8555b7d6e6d1bb +Digest = 8a3e7ad7b9b1b0cdc48e58d1e651fe6d710fef1420addeb61582bdd982d2b44c +R = 548886278e5ec26bed811dbb72db1e154b6f17be70deb1b210107decb1ec2a5a +S = e93bfebd2f14f3d827ca32b464be6e69187f5edbd52def4f96599c37d58eee75 +Invalid = Y + +Curve = P-256 +X = 69b7667056e1e11d6caf6e45643f8b21e7a4bebda463c7fdbc13bc98efbd0214 +Y = d3f9b12eb46c7c6fda0da3fc85bc1fd831557f9abc902a3be3cb3e8be7d1aa2f +Digest = d80e9933e86769731ec16ff31e6821531bcf07fcbad9e2ac16ec9e6cb343a870 +R = 288f7a1cd391842cce21f00e6f15471c04dc182fe4b14d92dc18910879799790 +S = 247b3c4e89a3bcadfea73c7bfd361def43715fa382b8c3edf4ae15d6e55e9979 +Invalid = Y + +Curve = P-256 +X = 
bf02cbcf6d8cc26e91766d8af0b164fc5968535e84c158eb3bc4e2d79c3cc682 +Y = 069ba6cb06b49d60812066afa16ecf7b51352f2c03bd93ec220822b1f3dfba03 +Digest = 7c1048884558961c7e178b3a9b22583fca0d17f355a9887e2f96d363d2a776a3 +R = f5acb06c59c2b4927fb852faa07faf4b1852bbb5d06840935e849c4d293d1bad +S = 049dab79c89cc02f1484c437f523e080a75f134917fda752f2d5ca397addfe5d +Invalid = Y + +Curve = P-256 +X = 224a4d65b958f6d6afb2904863efd2a734b31798884801fcab5a590f4d6da9de +Y = 178d51fddada62806f097aa615d33b8f2404e6b1479f5fd4859d595734d6d2b9 +Digest = 4c8d1afb724ad0c2ec458d866ac1dbb4497e273bbf05f88153102987e376fa75 +R = 87b93ee2fecfda54deb8dff8e426f3c72c8864991f8ec2b3205bb3b416de93d2 +S = 4044a24df85be0cc76f21a4430b75b8e77b932a87f51e4eccbc45c263ebf8f66 +Invalid = Y + +Curve = P-256 +X = 43691c7795a57ead8c5c68536fe934538d46f12889680a9cb6d055a066228369 +Y = f8790110b3c3b281aa1eae037d4f1234aff587d903d93ba3af225c27ddc9ccac +Digest = 8581034ec7d7a6b163d71820923f616b362748f2846042c9896d8e4bf7577960 +R = 8acd62e8c262fa50dd9840480969f4ef70f218ebf8ef9584f199031132c6b1ce +S = cfca7ed3d4347fb2a29e526b43c348ae1ce6c60d44f3191b6d8ea3a2d9c92154 +Invalid = Y + +Curve = P-256 +X = 9157dbfcf8cf385f5bb1568ad5c6e2a8652ba6dfc63bc1753edf5268cb7eb596 +Y = 972570f4313d47fc96f7c02d5594d77d46f91e949808825b3d31f029e8296405 +Digest = e5b30e0041a33281210644938d9aaa15ef2c1247b4178f7ca1ee935ce23daabc +R = dfaea6f297fa320b707866125c2a7d5d515b51a503bee817de9faa343cc48eeb +S = 8f780ad713f9c3e5a4f7fa4c519833dfefc6a7432389b1e4af463961f09764f2 +Invalid = Y + +Curve = P-256 +X = 072b10c081a4c1713a294f248aef850e297991aca47fa96a7470abe3b8acfdda +Y = 9581145cca04a0fb94cedce752c8f0370861916d2a94e7c647c5373ce6a4c8f5 +Digest = edd72dc0aa91649e09e2489c37ec27efab3b61953762c6b4532a9b1cd08a500d +R = 09f5483eccec80f9d104815a1be9cc1a8e5b12b6eb482a65c6907b7480cf4f19 +S = a4f90e560c5e4eb8696cb276e5165b6a9d486345dedfb094a76e8442d026378d +Invalid = Y + +Curve = P-256 +X = 09308ea5bfad6e5adf408634b3d5ce9240d35442f7fe116452aaec0d25be8c24 +Y = f40c93e023ef494b1c3079b2d10ef67f3170740495ce2cc57f8ee4b0618b8ee5 +Digest = 0d06ba42d256062e16b319a0f3099109518a765f26bac3b9f56930d965617726 +R = 5cc8aa7c35743ec0c23dde88dabd5e4fcd0192d2116f6926fef788cddb754e73 +S = 9c9c045ebaa1b828c32f82ace0d18daebf5e156eb7cbfdc1eff4399a8a900ae7 +Invalid = Y + +Curve = P-256 +X = 2d98ea01f754d34bbc3003df5050200abf445ec728556d7ed7d5c54c55552b6d +Y = 9b52672742d637a32add056dfd6d8792f2a33c2e69dafabea09b960bc61e230a +Digest = 41007876926a20f821d72d9c6f2c9dae6c03954123ea6e6939d7e6e669438891 +R = 06108e525f845d0155bf60193222b3219c98e3d49424c2fb2a0987f825c17959 +S = 62b5cdd591e5b507e560167ba8f6f7cda74673eb315680cb89ccbc4eec477dce + +Curve = P-256 +X = 40ded13dbbe72c629c38f07f7f95cf75a50e2a524897604c84fafde5e4cafb9f +Y = a17202e92d7d6a37c438779349fd79567d75a40ef22b7d09ca21ccf4aec9a66c +Digest = 5aa8e8a6f0622b841416e1a70d79a54641d2c699a075b6960fe5dcf96301da8ca6f15b0948d4ededac30a42e00d3b310 +R = be34730c31730b4e412e6c52c23edbd36583ace2102b39afa11d24b6848cb77f +S = 03655202d5fd8c9e3ae971b6f080640c406112fd95e7015874e9b6ee77752b10 +Invalid = Y + +Curve = P-256 +X = 1f80e19ffeb51dd74f1c397ac3dfd3415ab16ebd0847ed119e6c3b15a1a884b8 +Y = 9b395787371dbfb55d1347d7bed1c261d2908121fb78de1d1bf2d00666a62aed +Digest = 244656186c11c2e67be88099d55e60f4b68e61fba0b214aac3399dc559cfccc02f9884e85623426dbdc3243f2b5374f7 +R = 249ca2c3eb6e04ac57334c2f75dc5e658bbb485bf187100774f5099dd13ef707 +S = 97363a05202b602d13166346694e38135bbce025be94950e9233f4c8013bf5bf +Invalid = Y + +Curve = P-256 +X = 
ce4dcfa7384c83443ace0fb82c4ac1adfa100a9b2c7bf09f093f8b6d084e50c2 +Y = d98ae7b91abee648d0bfde192703741ac21daad7262af418b50e406d825eb0d6 +Digest = adaeadda3f0e941fba1d3e206a84e6d7530d800e0f215b3ddd82022f27c5be44fed27bc73084c6f7ca55555532be2e3b +R = 597e1e04d93a6b444ccc447a48651f17657ff43fb65fe94461d2bf816b01af40 +S = 359fe3817963548e676d6da34c2d0866aa42499237b682002889eaf8893814d2 + +Curve = P-256 +X = 1b677f535ac69d1acd4592c0d12fac13c9131e5a6f8ab4f9d0afdcb3a3f327e0 +Y = 5dca2c73ec89e58ef8267cba2bb5eb0f551f412f9dc087c1a6944f0ce475277a +Digest = e34a541f87ff0eaa0c640f555caec6bf11a1320c74c47a8ff172c4e2ec902e48d499732b12a86189e750bbf4c0424c72 +R = df0b0cd76d2555d4c38b3d70bfdf964884d0beeb9f74385f0893e87d20c9642d +S = 128299aabf1f5496112be1fe04365f5f8215b08a040abdfeca4626f4d15c005b +Invalid = Y + +Curve = P-256 +X = 7ffc2853f3e17887dda13b0eb43f183ce50a5ac0f8bba75fb1921172484f9b94 +Y = 4cc523d14192f80bd5b27d30b3b41e064da87bfbae15572dd382b9a176c123a2 +Digest = 0689927a38486cccf28fe9454e08e0d74843424b89be4cdee8e48f39a69addec730184da72f914cea67231c765ee2574 +R = 3156176d52eb26f9391229de4251993a41b8172f78970bb70e32a245be4bb653 +S = 62827a29e12d2f29b00fb2d02dd5f2d5412e17a4455f4431a5c996881fdfc0ee +Invalid = Y + +Curve = P-256 +X = 5569f76dc94243cde819fb6fc85144ec67e2b5d49539f62e24d406d1b68f0058 +Y = 1208c38dbe25870deab53c486f793a1e250c9d1b8e7c147ea68b71196c440730 +Digest = 97f8f8cea435282ac746730ac744bf97d85d4e249c0b1d9c7b83c7e59aed172ffc3724d7e6fab7d6ab55ffb3a39c0775 +R = 706f2ba4025e7c06b66d6369a3f93b2fec46c51eceff42a158f7431919506cfb +S = b4e75ac34a96393237fc4337789e37168d79382705b248051c9c72bcbac5f516 +Invalid = Y + +Curve = P-256 +X = e4b470c65b2c04db060d7105ec6911589863d3c7f7ce48726ba3f369ea3467e8 +Y = 44c38d3ae098de05f5915a5868c17fee296a6e150beb1f000df5f3bec8fc4532 +Digest = 5b937a2af46dbf18b4a6fb042ea353a6878e0d4beac016002b3d91a42bcba52856c07a3f35c08dfecb4f03e1c0b9948e +R = c9c347ee5717e4c759ddaf09e86f4e1db2c8658593177cfda4e6514b5e3ecb87 +S = baae01e9e44a7b04d69c8eaaed77c9e3a36ce8962f95cc50a0db146b4e49eb40 +Invalid = Y + +Curve = P-256 +X = 96050c5fa2ddd1b2e5451d89ee74a0b7b54347364ddc0231715a6ef1146fe8dc +Y = e0888a9e78aeea87f6e1e9002b2651169f36c4ee53013cfc8c9912b7fd504858 +Digest = b123e07744f05ad523790ea5bfa3f848869a3bfdbf936a496c8606b577ed8427eb7ee888e0fe18d4e3cfac73baad883f +R = 2353d6cd3c21b8ea7dbc1cd940519812dbe365a3b15cd6aebba9d11cf269867a +S = 85f560273cd9e82e6801e4cb1c8cd29cdac34a020da211d77453756b604b8fa7 + +Curve = P-256 +X = 0c07bb79f44012299fbfd5a0f31397aaf7d757f8a38437407c1b09271c6551a0 +Y = 84fe7846d5d403dc92c0091fbd39f3c5cbca3f94c10b5cae44e2e96562131b13 +Digest = fb8d12652de59e63ef5297641dfbce084808de146720e9069c2ef814bcd80b6187f7422a6cd9c706f8d64ccf80e8bc54 +R = 49e9425f82d0a8c503009cead24e12adc9d48a08594094ca4f6d13ad1e3c571d +S = 1f1b70aaa30a8ff639aa0935944e9b88326a213ab8fce5194c1a9dec070eb433 +Invalid = Y + +Curve = P-256 +X = 71db1de1a1f38f356c91feaff5cfe395d1a5b9d23cf6aa19f38ae0bcc90a486d +Y = ecdd6ffb174a50f1cc792985c2f9608c399c98b8a64a69d2b5b7cdd9241f67e2 +Digest = 2d8c6585a3b6319a556e27b53d434f455f73e771c8fc6a115f5c92a8e9a81ce2b4336a5c3edf98910689d11f4c93632a +R = b0443b33a6f249470d2f943675009d21b9ccbead1525ae57815df86bb20470bf +S = 316dbee27d998e09128539c269e297ac8f34b9ef8249a0619168c3495c5c1198 +Invalid = Y + +Curve = P-256 +X = 8219b225aa15472262c648cac8de9aad4173d17a231ba24352a5a1c4eea70fad +Y = 0fee2b08ad39fbf0db0016ef2896ca99adc07efc8c415f640f3720498be26037 +Digest = a4cc3b23f54d9d48ba6b0ad3da3b2e3a0806f41348bd7844e9c9b8648753bdeef8a039e1fa4f5172c89148d65b14056f +R 
= 134fb689101aaad3954de2819d9fbd12072fe2bc36f496bbf0d13fa72114ab96 +S = e65c232bd915b59e087e7fd5ec90bf636cfa80526345c79a0adfd75003045d6f +Invalid = Y + +Curve = P-256 +X = c934195de33b60cf00461fc3c45dad068e9f5f7af5c7fa78591e95aeb04e2617 +Y = b588dd5f9965fdaa523b475c2812c251bc6973e2df21d9beaace976abf5728cb +Digest = b962b63a7743ad77f9072f2f08d277f6dda8cc3420ddd37d873746008895902bcce218fbfed1a8cb28406978dd8e5134 +R = 71f302440eb4ed2a939b69e33e905e6fdc545c743458d38f7e1a1d456e35f389 +S = 54eaa0eb9cd7503b19a9658f0a04955d9f0ab20ebc8a0877e33c89ee88ad068f +Invalid = Y + +Curve = P-256 +X = 9e1adcd48e2e3f0e4c213501808228e587c40558f52bb54ddbb6102d4048ea92 +Y = 34eff98704790938e7e0bdf87ae39807a6b77dfdc9ecdfe6dd0f241abae1aeb2 +Digest = 21b883fae159867731b123a2606e9b3320fb53a00e4a5dfe3bc3429dd53b8068197be3c7288c1e0bf28a4fc7b13bd70f +R = ce4f0d7480522c8dd1b02dd0eb382f22406642f038c1ede9411883d72b3e7ed0 +S = 8546e1ee3b77f9927cdaccbc2f1cf19d6b5576b0f738bb1b86a0c66b39ca56fb +Invalid = Y + +Curve = P-256 +X = 93edbecb0b019c2cc03060f54cb4904b920fdb34eb83badd752be9443036ae13 +Y = b494e9295e080a9080fe7e73249b3a5904aa84e1c028121eecd3e2cf1a55f598 +Digest = fcc17b88077570c053650e1de42ae6bb1522900b38996decc87704aab6a87ab01d52f83f6442875f378a262c22d23ab2 +R = eec2986d47b71995892b0915d3d5becc4dcb2ab55206d772e0189541b2184ddf +S = 8a6c1edeb6452627ad27c8319599c54ac44cdd831ea66f13f49d90affe6ad45b + +Curve = P-256 +X = 3205bae876f9bd50b0713959e72457165e826cbbe3895d67320909daa48b0ebc +Y = d1592562273e5e0f57bbfb92cedd9af7f133255684ee050af9b6f02019bbcafa +Digest = 299a6070d32a5557010753d7559dbd8d2bde8a8feae5417616ceb5b167997fd2fac0c2bd44264106d3a9720d5e805a04 +R = 0124f3f1c61ec458561a4eaa6c155bd29e59703d14556324924683db3a4cf43b +S = 688a5c5fc0c7ba92210c50cce5b512a468a880e05acc21ca56571d89f45f603a +Invalid = Y + +Curve = P-256 +X = 484e31e69ef70bb8527853c22c6b6b4cd2a51311dde66c7b63f097dbb6ab27bf +Y = e1ff8177f4061d4fbbacbbc70519f0fc8c8b6053d72af0fe4f048d615004f74e +Digest = f1e9cda2e096ece9a1fc57e55eeeb56b1c635380c0f9a1800a4a1a5f105d1fc0c60e776234daaa8a6f7c0f5286bb420b3f607e7cc0a7d840ad5dcbab26c797b0 +R = 91a303d8fe3ab4176070f6406267f6b79bfe5eb5f62ae6aeb374d90667858518 +S = e152119cefa26826ea07ec40a428869132d70812c5578c5a260e48d6800e046a +Invalid = Y + +Curve = P-256 +X = 8b75fc0129c9a78f8395c63ae9694b05cd6950665cf5da7d66118de451422624 +Y = b394171981d4896d6e1b4ef2336d9befe7d27e1eb87f1c14b8ddda622af379dc +Digest = 0527199fadea30f9e5e66166a3ebcdf6aedf906984535f48165e591eff36f1c0de6b0fa69aefb6399e8a213cc2ce53268fbe18c3471b7708bc27c426aaa769a4 +R = 17e298e67ad2af76f6892fdcead00a88256573868f79dc74431b55103058f0b0 +S = 881328cd91e43d30133f6e471e0b9b04353b17893fb7614fd7333d812a3df6b4 +Invalid = Y + +Curve = P-256 +X = 76e51086e078b2b116fd1e9c6fa3d53f675ae40252fb9f0cc62817bd9ce8831d +Y = ca7e609a0b1d14b7c9249b53da0b2050450e2a25cb6c8f81c5311974a7efb576 +Digest = c926a5026d8f83ffa2092caf863f2d8a886af391462969b13a11d3c6c5fa66bb4281bc6e60a1e99a2e1ae95d689a66282096a0f27aacc048f32d39297649a014 +R = 23b653faaa7d4552388771931803ce939dd5ee62d3fa72b019be1b2272c85592 +S = a03c6f5c54a10861d6b8922821708e9306fd6d5d10d566845a106539cbf4fadd +Invalid = Y + +Curve = P-256 +X = bc7c8e09bd093468f706740a4130c544374fdc924a535ef02e9d3be6c6d3bbfa +Y = af3f813ae6646f5b6dbfb0f261fd42537705c800bb1647386343428a9f2e10fc +Digest = 4d74631eb67fd1a6fa93ecb6e6112b6699e78c1d4c24ae81d0d5842efe5d93c2fd7a7863f8d45d1b2fafecbe41b7dc19c4b2bc208e014ffdc216e7eda0392a70 +R = 6bd7ce95af25abfbf14aef4b17392f1da877ab562eca38d785fe39682e9c9324 +S = 
6688bea20c87bab34d420642da9bdd4c69456bdec50835887367bb4fb7cd8650 +Invalid = Y + +Curve = P-256 +X = 9cb0cf69303dafc761d4e4687b4ecf039e6d34ab964af80810d8d558a4a8d6f7 +Y = 2d51233a1788920a86ee08a1962c79efa317fb7879e297dad2146db995fa1c78 +Digest = 0250f93e6932887df519921f9a8dcff110be0768dc351ef73a940a579fae2d20061759e892e289c3e4ba5f7fe17d6ebb15c5931d48db55ebc81549f6637292fe +R = 4b9f91e4285287261a1d1c923cf619cd52c175cfe7f1be60a5258c610348ba3d +S = 28c45f901d71c41b298638ec0d6a85d7fcb0c33bbfec5a9c810846b639289a84 + +Curve = P-256 +X = e31096c2d512fbf84f81e9bdb16f33121702897605b43a3db546f8fb695b5f6f +Y = 6fbec6a04a8c59d61c900a851d8bf8522187d3ec2637b10fa8f377689e086bba +Digest = f91b09107d10904d3968ec29f85e456ac4e828f32e8da3db6a13f5566bfa625e2ad03f8dad5425a073c0d61d25de63dcafa9f4fcd206f29e9cb6b0fecd74aa57 +R = 1b244c21c08c0c0a10477fb7a21382d405b95c755088292859ca0e71bab68361 +S = 852f4cbfd346e90f404e1dd5c4b2c1debca3ea1abefe8400685d703aea6c5c7f +Invalid = Y + +Curve = P-256 +X = 633c2ee5630b62c9ce839efd4d485a6d35e8b9430d264ffe501d28dbace79123 +Y = 4b668a1a6d1a25b089f75c2bd8d8c6a9a14fe7b729f45a82565da2e866e2c490 +Digest = 575c64df58c8dc517ce65b388fa3ed69470163afecbabc3fa94b497ff7f3fe36ff12fabe2b84cebbf667744195091e4e2335a71d36414e0af0d0260fc8e8ea44 +R = bf2111c93ec055a7eda90c106fce494fd866045634fd2aa28d6e018f9106994e +S = 86b0341208a0aa55edecfd272f49cb34408ce54b7febc1d0a1c2ce77ab6988f8 +Invalid = Y + +Curve = P-256 +X = f78dce40d1cb8c4af2749bf22c6f8a9a470b1e41112796215dd017e57df1b38a +Y = 61b29b0bc03dff7fa00613b4de1e2317cfbf2badd50dee3376c032a887c5b865 +Digest = 4c097f2f5b2489c94258b34d529675bb5d77d4be083b51b01188dd42b4b5473982728763ee6fbad479375c5eacb5edaaec0b6583a10b19aad81ec88dde2d0e7f +R = 4a96169a5dea36a2594011537ee0dc19e8f9f74e82c07434079447155a830152 +S = a204eaa4e97d7553a1521d9f6baadc0b6d6183ba0f385d8593d6ca83607c4d82 +Invalid = Y + +Curve = P-256 +X = 3fcc3b3e1b103fe435ac214c756bdaad309389e1c803e6d84bbbc27039fcf900 +Y = 7f09edd1ec87a6d36dc81c1528d52a62776e666c274415a9f441d6a8df6b9237 +Digest = 1a3dd21cb6ac1fa7fc196319cf534b7608afb93805420fcb5250dff453564a5b22e22971a3ce6dd222405fea018cd0508d86c561eca15e1ac7d79c14e916b86a +R = 1cac13f277354456ae67ab09b09e07eb1af2a2bf45108da70f5c8c6a4cbcd538 +S = 5d83752e540525602ba7e6fee4d4263f3eda59e67df20aac79ca67e8899fed0d +Invalid = Y + +Curve = P-256 +X = 5ec702d43a67ada86efbfc136cf16d96078906954a3f1f9e440674cd907e4676 +Y = 05a62044fed8470dd4fca38d89d583ce36d50d28b66ab0b51922b21da92c56d9 +Digest = c5c016f6c9b525987dd835131def77cc72d8360d364eeccdd7af8b95712b6cd487c0b846201f3b64466fd140833514ae8d765da395fbd9d3c03ca410effa9a69 +R = 75f3037298f1457dba55743999976a1c2636b2b8ab2ed3df4736a6d2934acc83 +S = 19d43ad168dda1bb8ac423f8f08876515234b3d841e57faef1b5ab27359b27ef +Invalid = Y + +Curve = P-256 +X = f63afe99e1b5fc652782f86b59926af22e6072be93390fe41f541204f9c935d1 +Y = f6e19ce5935e336183c21becf66596b8f559d2d02ee282aa87a7d6f936f7260c +Digest = 9eb2f9fa96a1f3ffcef9600522730e86d26d328ec0c1bf2fbfe55a38754610341fda1b894fdcf10c9bc4f48819010fdcf0d24f27ff539e40c6855cafbd306386 +R = cef4831e4515c77ca062282614b54a11b7dc4057e6997685c2fbfa95b392bf72 +S = f20dc01bf38e1344ba675a22239d9893b3a3e33d9a403329a3d21650e9125b75 + +Curve = P-256 +X = 6d11b09d2767cf8d275faee746c203486259f66dd2bfa3a65c39371a66b23385 +Y = 4eb05c73e05261e979182833f20311e5366f72f4b949665ff294f959375534c6 +Digest = 0e71b28b0a1eac7aa881c09daec616c93d9a9286b5f5fdf2642d211021b125fa884b2595b73c7c3e649e61cd7157ef6660076a3b87ddf830db46533f3aa30afa +R = 
15a697cdb614e11c0810e1e764cd501fcabc70874c957587bc4883d9438e177f +S = 7bf6244f92bc768063cecb5336c8eaacd23db930b28703560f241c7d93950dfd +Invalid = Y + +Curve = P-256 +X = f3899caba038efb534c4cea0bd276814ffd80194473c903b81af11c8c05cb6e6 +Y = 6ea6b17402fcf2e8e737d11ffc7c2ed3b2d0bc3b8f271a381f4294cff62682c3 +Digest = 104ace16689d785df09a81c5cf47a496db30fbd696aa4df080219487575a23641436e70329dd1c13290582c0d03aae200e51189d43666c86f38a5203c16cd7e4 +R = 57b99380452e1d37b133c49b9ba493dee8630940477ca3351a43d90b99871e6a +S = df599c3a37105af3ecc159b3b685ccb3e151b7d5cf2d97147974ae71f466b615 +Invalid = Y + +Curve = P-256 +X = 1fd6f4b98d0755291e7a230e9f81ecf909e6350aadb08e42a3262ff19200fbd2 +Y = 5578fef79bc477acfb8ed0dc10c4f5809c14dc5492405b3792a7940650b305d7 +Digest = 761a54f3718985b6d7bcfdd57d6c4823f854831bd29305fcb07e34e3f825d451fca28a62ce9582e3957d89ea7c1bc1afe3aa58fd2fa18566974600fc394cf2a8 +R = 97a99e96e407b3ada2c2dcf9ceeeb984d9a4d0aa66ddf0a74ca23cabfb1566cc +S = 0ecac315dc199cfea3c15348c130924a1f787019fe4cd3ae47ca8b111268754a +Invalid = Y + +Curve = P-256 +X = 2dcbd8790cee552e9f18f2b3149a2252dcd58b99ca7dc9680b92c8c43aa33874 +Y = 5dbc8bb8813c8e019d80e19acdb0792f537980fecde93db621aaf1f6d0e6ee34 +Digest = 45b082e804443b53a82229cdf13e4c5f8f31fe93170cc8a23f63eef506cb7748388e1a971a2f81e3daa324cf2bb69118f7418f40df66a24f50c34a55e1416c3a +R = 2bdbd8b0d759595662cc10b10236136ef6ce429641f68cf6480f472fcc77bc9f +S = 7e7df0c8b86f7db06caf1610166f7b9c4c75447f991d5aaf4dea720c25985c8c + +Curve = P-384 +X = 6881154cfe3f09affbee04cd387b27b7854326faf8906c4b9c9e6ac2c632e0d59717b3f33f6d747d7b7cbb4e4dc01fb8 +Y = ba295ae0966f06ad9d84b3bb4da7f99b56044c99f88d71082cfea6964ea3c63bb79806a6a41fcc314b55b3f64f82b68a +Digest = 8a6429d55885146f7aab582a1aa9360fa9591b0a +R = 2112385a75d4edda89ae2bc3c74524dc792544a3a52fdb588da3f0feaee6a11623db275e2ab8abdd998cc42a29c60856 +S = 8d308a3987b81c595f8cec19898b1a42da8eda97496af280033b0f915283f171fed7e2a221fa9c78927962189333f437 +Invalid = Y + +Curve = P-384 +X = 2f2f43f244ae027c3d2ec5c900393f80a8ad0e9b9a12a047195d29a39f2b7026b071688dd9a6764379d02a5ed8035ec1 +Y = e43d45851bc76c37d34dbed996a65ffcfbbaf0e2cbfbc9f62d2116bdf3b330bbef5acdbcd0aa6d949f771daa17cda1e3 +Digest = 5f41322db1a276042ae807f0f0d6f1e04cb5cd26 +R = c011c52e9cb02048957a233704ff9a2c1d4c56e08ebb083aa8ba351f041a23a7d0da19088ac6c60ea2ca117531c7cf35 +S = a66ca9bf06c35d129a8253a0f793acf681e482d9994868b275a230b215286e03a66a0de77c7a53174375137fd4688556 +Invalid = Y + +Curve = P-384 +X = 9a5e1932d318bfa7986f0dac4489c6f55775427bb60fb24bac7646b9994bbc3a9b5cd15e818cc4e832afc1c3fca9abae +Y = 64c89e7c3399c136b2718ab675944207157f0bf23d9e2a807ae7ac3bef81da7ec3c56c2d2c08afc53301af2a3cc71861 +Digest = d36ef9ee70a3b61ba31cdfcd0cac6e49331a407f +R = 4cf6c63fea6c80efc105cd99afe2b53da05ae16566ddb20b9d40a076575ffac419b6807fa336fc6e7c7416c59775ef09 +S = aec2d96054b4b23c49faaf9903ccf63bc96281fb7c1b9d14daa54bba51bb2b2f4d3a901f3b0b9cb2b62976459219350c +Invalid = Y + +Curve = P-384 +X = b3aeff27b65540c6da10a88008404b1d49239c87fbf47932518fb87a9bb132403d1f310f531d086340bb4a68c3e64b9b +Y = 567e75f442fcd81017b8adc4cce634f5ffa3cd497d38221d34dc1f43aef99133131ff1b197f7b9f37beecae5c438849a +Digest = dd0f9c326fb50593fd0a0df31abeeb00a22eb956 +R = 3b94a2514eb915b71e18c867ad7f508a35375c5bcd4b797b86054798569870b2477e2ac14406628017d829400efc63b2 +S = 179a10441a0beea3b375248e697e0d19e24bb68184c373fe4302839b97dd7353a5a25929c2733796b0c0d8211bd67c51 +Invalid = Y + +Curve = P-384 +X = 
0874a2e0b8ff448f0e54321e27f4f1e64d064cdeb7d26f458c32e930120f4e57dc85c2693f977eed4a8ecc8db981b4d9 +Y = 1f69446df4f4c6f5de19003f45f891d0ebcd2fffdb5c81c040e8d6994c43c7feedb98a4a31edfb35e89a30013c3b9267 +Digest = a871caf9fff9856031a79a55b96753c1a34ccb73 +R = 8d9d3e3d0b2b2871ea2f03f27ba8699f214be8d875c0d770b0fff1c4ce341f0c834ac11f9ec12bfdb8320b1724c8c220 +S = 62150dfba8e65c0c7be7ef81c87241d2c37a83c27eb31ccc2b3c3957670a744c81be6d741340b5189cc0c547df81b0d2 + +Curve = P-384 +X = b4b92211edbd41c5468d2ba70810bc37b5e7c954c7bd0db80c4fa89ccba10bf07cdab953828a068bc0104d28e4040c14 +Y = 93ed318efce3dff98fc782b788d78658ea5ecde4f716e2d5d0ec2d87a2e761daa1f1658cfb857762caa567baaccf9924 +Digest = 765343d50541bc2c0e20193648048016a95e7588 +R = aa3978eabd196ddf9cab2815cc9cbab0b61cd639deaf70e093a10a58ddf9f410ee1ab965ff8fbb98efbe812421a613d3 +S = 02761a2947e1855806b8a25b9ebb0762be9f5517461a371e5783f34b184f32c4ea684b362119b1a2d8a3ff439f10291f + +Curve = P-384 +X = 63b4cc14f9efd3b8f29e65806591d1e9c54f34a3f5231339bcdbfa4109c42d946a59cdd7bbd2591fd1b2383a0819772f +Y = 55ab3d208109da6ef039c23cddd52a5af619266d8fe066dcabb1af885ad5501401a78c44ed3b5fff2892fdcb2a3ac8b2 +Digest = 4535ef8d7396b4f2af65660ebbb56f356cacefd9 +R = a3f9b840fd7201356f35b5dde39027410aad26ac61919c14fe7b0535bb74e7218cb3312bfa60aac63f14166f32ceff26 +S = 1b1bcbcb0237fad4e406c8d4e3e39b55642d8535afa9ccbc9c601cb4e01891df79f1bc792687cb3a5ee7703565c4a13b +Invalid = Y + +Curve = P-384 +X = f82f82f8f7454ce7a94a040ec0bbb52d49e3b9f8ddd095704973c760ee6067a5c28369656f22d70d8bb1cd70ef9bfea0 +Y = 0e36e256d02870ee5646a17aac4b280c9d1d2e1d4803eb3cb32e7f754cc889522120efd7c4d8a82e509a4d8f266d3ce4 +Digest = 26302c41e6da59e2df2e26c12382738880be94cc +R = 27a2332f3c59464f5dfe7bb1201a3936248d375bde603724c048eb8f7c0c2be3ed4b56c14b51d7d68bd2554526b36d9e +S = e1f90367b0cc530c545f95163d9ffb1208c943685d5ae221052b83ee40953397be581e5979c9855b20246e9d26d57acc +Invalid = Y + +Curve = P-384 +X = 7d40b51127cb1642dd8538d4124138a2f49c41b4d12f702c1b0cec8deba50c3712e01c2e1e693e00438af0e86025da33 +Y = e734b5939b673c45dd32baf20d234f01b7124b391d14beea231e9c604e813fc83b3a77b0cb1f2ce4873a69b0165e369d +Digest = 0b30b209147432207a72177997d28d6f1d03330f +R = abf16821b6657e0005071f78c679cbbb130bee6e7ca63526eef0f747fb721feefe6258dae1aa02064a700e963bd9dedf +S = 3f7e61c34a30cc5ff7a8be375fcc9c38a76dbc0c30a4356843421ca37a7bcf24edcd41d8235903bb522fb6e5a8033885 +Invalid = Y + +Curve = P-384 +X = a5b59d59599c105e39f61354da99c7c9135c749cf996cc2252eb83b008299cdafbcb44227d2d2c4a5ffa44823922893b +Y = 0399fb0edcbfd0b76b524f22b7b87ddbb4fa02f510661615312a4492eb3f2001e0fc0e479f77c33a88f9a7e20757373c +Digest = 44aa3083d111bbce7feb412af74a782cd320becd +R = a4c9cac2409a9bfea1ebe28fec4e19545f08cd18fdd31048f52a3f2d32b2ed859dcae4dc12fb2fecabe542c4f03191ba +S = b4d83f927ad1980d96cbb0ccc36aa640f786293b8b19e4dd97a797d192b420f630a5e42ac42d8736e7d42008f445dbc1 +Invalid = Y + +Curve = P-384 +X = 29178ce9127e1048ea70c7d435439e9ff9915387e51b7e5ca10bfdafe53565978eb3784d9a4226f443d4834f4d451685 +Y = 5cc2970589a453488649711bdf3cdac9a200519aae65b1c6bd54fed0d965755b36b74d978d674275bd71a03e8f054b0e +Digest = c679b4a0e61406c4869d721192bd314d77e1cb39 +R = 5d6f5e9a94d9c92a0890c558bc0408b3405cd04e33f663df16701e80520e4394f1c54d3c8225d36f4753a799aaf6ff90 +S = d895b1cc522ceec6a7867867b8f603245c6e4d48945dfc43af721ebae4683d40a3c21b905ca3bd4b974d36806825b2cd +Invalid = Y + +Curve = P-384 +X = 9f03569f8c6ca2c16d707f0ca36a8a8cf214a9d5c14034829d709e283cd675eb4e3090c6b973429efdf476c0782e0a7c +Y = 
e1b842536731e91596782787d57af17db85dc92fd2fb95ac65339174aee66775ce0a4721d1faeb29da968ea5eb705e59 +Digest = ae1a63f88a59c7da5d9f512d11bbd5d75dd1f583 +R = 31ccbe22a360b1786dac89394c6ef4ed6604943e50837395f96052821f6182914840096e90f2ad650917bd91d7bd4cfd +S = d97199a6b952dcaefb1defe23def92bf2ee236ad18046a2ccf8924d42ee10a62e70ffe7f3c909b11112278f160d98b7a + +Curve = P-384 +X = b85e78a935d169dd5ba8f558f964b21c07804464816f9231233184675f557463a8b00470ac0ca8278cd008f4642e7962 +Y = 8edf7be8584c5f207939d479e65173e2e69673090a8538fa93efb4432127895d92b4e4cf13b7632a830e9a33b37f75e1 +Digest = 811685f7ff2701e692f6830a33d8712d0432cd5a +R = fd2876b250a94ced71734aa7a0d32423b2c6f039c926c557e748f38e23bbdb46e17d1204832c6f76c3ea854e1da23979 +S = 76409e381799502c81194ba87540aec0b89fc4680dd683780d49f82a46a7191b40f5f06ccb02e45e704c31fcd59382b9 +Invalid = Y + +Curve = P-384 +X = 0c74aaa0527524cb6171ab741896b405a6ac4615e474cdc09c9457b18bed33c6383e1b92f2fa1306e8e5dcd1667e45fe +Y = 7b00d934dfd876f6e07dc0582b20ed650be104fa603a5a1255c62b6059d2685aa9773f1ba31254d213c815d0efc8ed93 +Digest = 328029316d73d1b8d2b8927d12332036e5671384 +R = 832c62b0f34986eda9d1ace5068a0c5318051b0d0166d3dacf137ac072cc359f109ad6e17059e700bb1958bcf4101246 +S = 6bb56f4eb550688ea66e5dd09aebe7e0b39e2716b4697ebb68f113e080f0ff26fd0fc947a34f3c5a8a2f10e07dc1405e +Invalid = Y + +Curve = P-384 +X = 4104de08b4108ee26ee239e0a5d340c1b1aa48b1b3b40717debd6ed3ff0d777923c106f857a3830ce7f3d08d0d6d7908 +Y = 00498c38393e6393edcf254804558f86e461df1f5a6557bc5144f8d2f3806413d372b6ce417d531c08a52d1e38e8b949 +Digest = a13ebaf4431c43b684d1e18e610a75fd7527200e +R = 9924a3273248db20db007309560a0e616572ac799d773529a5215786cf4a6e03cc73bea81d4810c1eee4b5e975652eee +S = 6cc8ea4c4c56da87c25946a198e86917227bcb90da7be1dcde7b6547bc45a98e8175dd54af15bb6ef955b4cb48b7bb0a +Invalid = Y + +Curve = P-384 +X = b6bc9418f3da0cce38a65f1b52bb3a9d22a0368e02f5f12fa1f1303ac67df1cffa55d049a782bf5bddb5e841b125aed6 +Y = 3b578a0560280a2958a14286e10faa7f5dec77fd8d90123aff5780efa8a636cee833fc9f10d7a164f1254a483b613746 +Digest = 7b44de2e448107197558cb071bb5bec9a5849467827d29b2c6625708 +R = 6602090aec001c16e5f6e7e3e488bed5d1702d36b258b6a8a2d8392a5ff30a6af12fbf4308d67eed6aaa8b7be8b831c5 +S = 65d0c3bb1910ba0b7cc108ae1ccaae63405ff01a8df91021e17cd46aa6f8ca8f4eaeac6d6fc26fc816a3ea537fd9576b +Invalid = Y + +Curve = P-384 +X = b4ab83a4ded7d76aa15eaecb1bafe59427d3cfc38564af9123cb707da2405184acd40a6c093ba29e321ba0f67c1e0c6a +Y = 26e2902499495f8550e798617a44ac9990c4c1cc3527dc0dd003a15aee3cbd3955151f7863de1692a94aafd3730e7665 +Digest = 8f902a34f36d7cd36748d5ddcc8fba6040be223a462842d506f185d1 +R = 61e48d5a100049578e820768ea57f30f27ffd1a1f839fabc55e8f4816c9b95d042619cd3bcc7180fd99834e344f53e7f +S = 977b81d43216f31d8bedc3ffe873047817de3441df8b80a321aa0a80931f25a15c6628f43cf8e48d5c6aeca7626b0a18 + +Curve = P-384 +X = f886f36fcf34e8df2a7e09220051b9981a3a6f693ec5999f28864e012c13896d633c9564f0118a95631cea8355b25b20 +Y = 746f9a77835325f18338dee5dc88a9b086b858ce15b4e4462a98844bb01811195f4fae0bee8f457c32823e142210dbb8 +Digest = 6a80377d3c7f0e6a50f6dc1656cef5a0d33cf7934441244f69f0062a +R = 665390653ed280b8f6bd3718d8423f26cb38d2d7faa10fc0f094295677d9dafad45fc64cfc22ded56afdd86a77cf3c33 +S = 864f0eb3a8d93c388d987cfcb60bba76098039d46bf4ff4be083961f70a29e724c25cf56685802b7b5be048107ad52e3 +Invalid = Y + +Curve = P-384 +X = 5fc835a2f5429adb719ed22f11dfcb02731da6759a8ea75c21d1af9631187626c31e191f4dcdc183df01c48e13dbbce6 +Y = 9ed2d03df1cbeaefd4478b8106e90f92e0b6e958145cb81b9648aef0b96b71d1d55918564694b1987d68cc8e7cbd7dd1 +Digest = 
807f609592e2ededa12792a7006a6db641904e86a1df3cec477dfd3c +R = 94d9dedd27f2d014ba84ea58d2e88d68f3e86ba88b93750e50255211effe88b0a0e2f62017f22965726cdc77c55bca4f +S = 14814bd09d9b7ba81b2485777cc588b5c0a4064df95c63f18a8bfd57494cd0f40c5bda9dc6c01ea72540f57a354360ef +Invalid = Y + +Curve = P-384 +X = 0b86851d7c19f0f04a16e5e2903a36d09bf1863e152d87936fb2d74cf916bcf6dedf3c066d242f7dd327df0fcb42270a +Y = b0c93480740bb635e6c25fb61630fdfcc462a1418366a51b1265656f721e18ba89ebf754c7dfdad865a252c884a6c4fc +Digest = c34e896a31fc4de7596679e12bb2416a51e58e8942eabd5cb01f0737 +R = 33fa5fe3e495076e90f4b62753d3cdc7603aa7f5b407dbf89a854b9521d15e6c381d3cf28f103035dc4291ae318c5f82 +S = 30919a2a3fae71e1afe8378aedcaa08fadfab6c6bf954031452d4fe514969ede2acf0347a2f1e81abf1bfb9d8bd55a36 +Invalid = Y + +Curve = P-384 +X = 6f8f2fc40d1db28309c8850bf94d77c01c5449b4fc556e6bf50e5ee805209c4489d8ff9bd781699eb0e42f6a962d56fe +Y = a4c7c77271dbbe7e00d1c6e4287dddc5463c6803a577a18f89a5eea01c6addc12404353abbc128cb9cf2496732312d65 +Digest = c19cabc6141b2adf67fe4bd0a3fead50473dea8cb0276de1fdc467c5 +R = 327c4642019a635d80dab82f7dc22e3102a3c1ba684c2b6de67d3d3009a17d39ae3d58ca2caec9f6f03f5ba3b406178c +S = 6b1af807cc7265cc6d3049959cd7779ae0de819036647f9510b0e9f7e4c0e3fece5fc3741b68881145a2c944dc5c54d1 + +Curve = P-384 +X = e98ba8016a976dcc3c50127d2af792969835b1096b1644b37c004d1786f4fb1026233f33ad56cd9444ba0a332c92efb8 +Y = 54bbcb78ffa3c855dd24bf182376ff5d28dd7b7551e4b05a19549c9f59c83dcc12a43092d63c5967fc0256612475b7d4 +Digest = d8d9319d3f705d03dfc992e8e7596586200fb1574f2a918350deb268 +R = 3b76a0c0ece2348085f3554fc92b9e5b0fe84801ab2adf1d239d7c81c9697b62285e8e5667774559d1bbc6e86f2ade64 +S = 91d929e42f8223ccc74d4cb09ee7eb619d3a348886c21091ec55d36164ad3cc04e1da6edd88ad89710a908ca4bc00333 +Invalid = Y + +Curve = P-384 +X = b8d7a836715635a8b095d3712817aa9e6ffdd98d24be2db751bb0c1fad42b082542500ea255cde17525ec159afca7002 +Y = 1a526c876d4771157b4f66e3056485c95066d4bd1e73e991ce6d5d3642807efe80015c52ef3cf8c86e57ab9a510ec86a +Digest = fe23e8ab9dc934144247930a48babb0d8ba57703c2bef60e0e9a1e2b +R = 9e36f47ec1b7ffdc6e3472f3cbec913494c0bbaa0c073f597e01845b5a3107c0e23a4575de4f2b582e1c2fe3067ec048 +S = b013cf51008a89b379a2a6b519b8d229ff0374401eae21a8da350fe35756b94168e7fafbd81f0f681f21c056941a82eb +Invalid = Y + +Curve = P-384 +X = 4ffdecf5d5f7c1164297a93742c8a685bb425b97fdfe85f630dab2064ab29e52a0df34629c2531048c288216723fc9bf +Y = 84fcff3e7e478a6932ace6f6b0ab70e61d8a5137b76886c59e721d938e0e252e2f7e57c2ab7dab90493446ad85c3fe4c +Digest = 28d44c363bfb2e36bc59bb68c56e8b5d2587f149839fd3b8c05d9eb3 +R = 7d909d9aacf064c32d070c3149ace8b8f5d83b2006e8460b84c4bce664fc20e91c61ac8b415965b6155eddbe9238fe3d +S = 19d909e358e71985179dab9113941ecad21e4f3608cb3a32dd065868af1657df8e06aa86855ac7ad757a7f8fb568a953 +Invalid = Y + +Curve = P-384 +X = e805e0733fc156bd582faaf794e58d4630ce73fc383cdc964dd337728f774e4989a697d79665a3282ee6e0ee343d6c7b +Y = 43821b7b9a6ce1ddf0c59ada552668a0cfc85a87a610b5c36b7a691947116b49a4099340306e53494fc6b496cb8d12b0 +Digest = fd1bb27d666e3d40f5bd19d8c026a3614404b9edc11e582eb80b044c +R = 3d4fa4ec95b55feac607fddc618d6f4eed71da65dc49d732e64460e5c80c57dc4421c64bacf3ef1e22995fd19c2a3cf5 +S = b11898ba475f2b28402d038afc15f171b99aab93437b35a2f8a3b89f42fdb7f93a0469d9da7652882000dd5bb1e8b9a8 +Invalid = Y + +Curve = P-384 +X = e15c7ef9791b9392c3e97389f2597ee161545c267e584b94262870ef25fda348f72349f396c27ac884fa8d776387fdd8 +Y = 107b4a7da8be564a14f9c45e4df5cc9b62f0671b3f2c0573c33fa37f985fefd1ae3ff2640947ebb12dffda72757db6af +Digest = 
3d9611421379fc93226fff23f5fe472a33f6bdc759d5705f7e9a2be3 +R = 9d715fd1a3668283fa83c407242e8d2a4f3fa1bf41919ca4101114bd0e0ac1b16c4379edb11de5210eee8618d42e9ed1 +S = 2dc37f453c8cfe01ea80c56d1865daf0f28847b12970132a1853c3ed80da6693e0da47a2476207947f29da34d68d604a +Invalid = Y + +Curve = P-384 +X = efcb97dd73106b0a2be4f665c496352f6938da9d0fa97690dc0e8d018b06dce2ba8d19b93ddfe889d549a33e64497c31 +Y = 66a0cb7e64f40470b6d09b9e12f217b59e9e6615af52fbdc4ddcb379e77809361eca2093a3e24c7103e971567018400f +Digest = 5598b06acf834ffbb2e50784fe2bc493fa51967f7ffadf1ece63f9b2 +R = 4ea5d4faf8ee52540db2f4c6283cea5302a3540a56e14c8a7533441c248465be99e10f23bba85be9634efaba7a8b172e +S = 4c98a2142ecaba7db44c78658efffc1175f810a147306ba2e6498553526adb1507d7a99a372e0f84c8dbd160ef7fd5bf + +Curve = P-384 +X = 4e916a3cf2561580b49ecc52321db7103292fd2fcce8dd4d6f86be6035808e0df51c3c4ac1894f0b08ef6ebf953e0d18 +Y = 4e6f28895d024b4c71220b27052ddd4bf6115a260825acade48c043b3e06d2b6b8e4ebdf465980f3b013cb575d475bbb +Digest = 1668ee6ae19c2d6f23b9184b6895ede8f55549b23095d53ef89487f6 +R = efce00544ebe0d98ba6015c07e3e9d09af808d49a0820c22ef572a3ef9c8a684b377bef1f8b3bbddb734b9b0bd0b1cd4 +S = e80d0e183b3f00098308e20e5b4ae393a07f1d1a8defda9a9d10f19b3e5236e42f593b1dc57f6718dd8d4583f0175ff7 +Invalid = Y + +Curve = P-384 +X = 3c6528c82d9d5e8dddf41a211c70f78604d81f49853bdc746270f1340a2a645dca3bc7844c3680268fa5973cd1758313 +Y = 4b9e697f1caf83d3224486bb0a8cd6a7c56e47c91043d8cba3aba51b6e504441d37abcc9b7b2d49b9126463703e514a0 +Digest = 1b39217bcc5dc841b32ddf00245623c581f19cac8a4ecd03eb2c07f0 +R = 848814c01c3d18534f39bcd53a8736db16f0f77a015a0e578cbb2f831739723e83b29cb6d4eee7822c76ff056d0f467d +S = 05beb19f766bd1d4ec5e65786042258298a2dc617e3f13d8e2f0f4b50d934565f3162c737fa791a81897397f29305943 +Invalid = Y + +Curve = P-384 +X = 80c3f6488dcd76f33cdb75e30f8452ab9a3bd6110f14e25179b0aefe4c19c60a07b4af10844b130b0b75a7024e341298 +Y = 6c85a17ad4bbefb33910250e05ac02a17c892c3380712d06dd070843dff0d040e219dae78679b774cd5eff0adb67189a +Digest = 23cd0066d1d88702c5d4461deff89aa5662b517806a04c4da30e0d82 +R = bc444deb0c7dd9f96f20a7ffd3ddb35a1189316655531860c39b5f87f09992106985e5562e083ee9f538c8e2d5363c52 +S = 91adde5d47eae80a98661f4347fd6e4778478c3d4aff3cff8aa92e2345a8e03cd4ab64adfd38e461bb98b496516439e7 +Invalid = Y + +Curve = P-384 +X = 97c3f446803a61a7014f61cb7f8b3f36486c7ea96d90ee1767f5c7e1d896dd5114255abb36c74be218c1f0a4e7ebba3d +Y = 553ed1fed72c62851e042f0171454f120029adba4ee26855ab881d9470355f1947aa1d2e806a7ff2583660fedbd037a0 +Digest = 647eb206a8477440b4bd048d00f37dca8635b15c2a8e79e2a9d74fb9a5553211 +R = 7b06d6c2b63f1cc3bfdaa897d07dc15a83bdf35d979f70c34578332b3f4920422bb24867c51bde10831324df424e04ec +S = 4bef715161f400dc98d4b63bd13ff4ad4a6c981ead44bfc662fe9bca4b56cd790698e4deddf9a4bd69327f26bfe801e6 +Invalid = Y + +Curve = P-384 +X = 08bd5c6cdc1f8c611df96485090e20e9188df6abb766bff3c1ba341ed209ad5dfd78b628ec60998ddfdd0dd029352fbd +Y = d9831d75dec760e9f405d1aa5e23aac506dc019fb64d44bd57f6c570d017e6609f8fdbb2dc7b28ca9e00e37cd32a3b73 +Digest = 9a4985f744dd6f2774cb6f20ad6b6969e212abf4ac035b72ad3f8b1955ae1862 +R = 8b372c86ed1eec2163d6f7152e53696b4a10958948d863eb622873b471702ac5b2e75ff852149a499e61510905f98e4c +S = b2ed728e8b30787a28f2a6d3740872e47348686c7cb426411379411310241d25f08a026b853789b1157f1fc1a7f6ff49 +Invalid = Y + +Curve = P-384 +X = 10a784abb3c549444a62c28df1c926b8aabb20c8d9aa4b1f7ca830258857cbe9718dbc9845fa9cbb78587a373baee80d +Y = a1ad0c10b5ab6780cad49c8cd3eebd27de8f1b382ddd7a604458cef8e76ca632a7e44e1c63141a742426cec598029e2e +Digest = 
f5b47101b4ff9baf64aca830b6afbc4f9620035d88a1d84a12cefa6f7f99faf2 +R = d9e52be2a3f7f566899cf6daaa38116d092473066f3a1bf91f3df44d81bca1deb438d9d25ce1632599c1d3576a30f128 +S = 0cad30bce4b3d7f40b3eef762a21bb1a3bad77439838b13024b7b2c70316875a99e80723a74a9e7a404715ca06a5d673 +Invalid = Y + +Curve = P-384 +X = 8760182393132d69011edfa127e36f92eeac8272641c27f52f3337ef8af7451e6d14f4e4590c7eb9fafb76e8c92865cf +Y = ebc2b123ed871ca570ead40ae8f6f32335393c569b21b38f626d09c064a3c8668e9fb10a4667e0f0c68bf25ca98fd6dc +Digest = 979131ca1d07e0b4ac6f27b20a978e0a230159eec4906db5dbd22b10ec71af87 +R = 1db957e5c2d294035d7f476a0cbc28a4aac2614d8212de5017076cd836bf04ffe237dce8fec91f2fb5ef82449ff1c65d +S = 3e3b9058d0a9c5b417f9c6f86557b9d50e7a902694a7012a1be6bb70708497e4d39fc1f6d6bc60dfa52d23cab173385f +Invalid = Y + +Curve = P-384 +X = 2b1f98d2acdda8347b9a68c75174408eae7de3d6b9c08c26e73ce9ed2ac147b8d90cd82e30ab43909d63f6b457de2071 +Y = 33f5e6f5f5793201991e014cce0045d04adc352298e32f45f4e374450111c8456b5c2efaec43d157949b5c191b2bc934 +Digest = a1daaf888d93a2a7e52bcd2a66cca3ff2e02916616d1919adefdd7257490e5b8 +R = 23d046402cbce807d232bcf0dc96d53c72992e0ba1ffce0d79050c0f4c5ad9bfbbdc1c96c730d67ff3aa3edaa3845da9 +S = 2cd46a4fe5d120b3af3a6d9ea63cc78f4079e8b5520a8fa96828334a4f182ff4d5e3d79470019e4eb8afc4f598b6becb +Invalid = Y + +Curve = P-384 +X = 86ac12dd0a7fe5b81fdae86b12435d316ef9392a3f50b307ab65d9c6079dd0d2d819dc09e22861459c2ed99fbab66fae +Y = ac8444077aaed6d6ccacbe67a4caacee0b5a094a3575ca12ea4b4774c030fe1c870c9249023f5dc4d9ad6e333668cc38 +Digest = e3bcded61cbb0bf6ec20d59f91e8e73e532f15b082b89c984c1b51fb0d1db8a9 +R = 798065f1d1cbd3a1897794f4a025ed47565df773843f4fa74c85fe4d30e3a394783ec5723b530fc5f57906f946ce15e8 +S = b57166044c57c7d9582066805b5885abc06e0bfc02433850c2b74973205ca357a2da94a65172086f5a1580baa697400b + +Curve = P-384 +X = 9e7553eab8cc7e2e7396128f42ab260c6dbb5457cbff2070ea7c0db21def1537939e3f02699e5dd460eca3798d08bd6d +Y = 892c0c8e47dddf858e89099a8fc1026e8b8333532b22f561f7647f63f9c79dbf5e8dd18fbfe6ff34902233119c5d5aa3 +Digest = 0f2a9b447ea5cfcfb9e67d661d7f0752befd3b4e3454fe40b9ae1eca47806025 +R = 2452da6a48c3749b66e576e0f1f768d51728be17aea149164c4e1654c5ce27f625a4610c4a2eeddb3a0626d3abc6c37c +S = 499504fb58c9db24a7ff5f7921e1312f8aa583c08a308e080f5ef1acf5cdae7927c4101573db069ab0b6de7f4f1cab38 +Invalid = Y + +Curve = P-384 +X = 0cf4dc51e71185a29c0c6fa3c075d9da5bd7ede085053344dce5dbbe8329e8ac9045f7246c9d0efed393b8e113c71429 +Y = fdb7917b73974b355cf9f3bef6a0a460c2d39fdf1fe32a7744be0a54ddd1cfa8d03914cff4b5ca536b40707ff2629aa4 +Digest = 331aefe2369b9c5ee6dd9f850259b3b8512f5934434e61573f97fe2c1cd2b147 +R = 3812c2dc2881d7ef7f621993b161672329b261ff100bbd19fb5826c9face09aec2017b6843d69336b813b673c5402527 +S = 5dc102fab9d6325131c556ec00309c2959d1031a63fbc1e2d5d04996d3234ed33875c0ab98e5878e9bc72742519ed398 +Invalid = Y + +Curve = P-384 +X = 6c590434988155236b43147389c6dbfdd27dcd3387e9b4c2587ece670753a542a13a736579887791cf53d31e5ce99994 +Y = 35a20194ff3f1b55f7ffb2758ddd4b98dd0d9e0cc213e10ed25e8e0430fe861066c1d4423c67f0c93f7ebd87fd3c561e +Digest = 153475076a003545d3ca3d4a772866f12cc85f6e69f8c486a91a80fd709206b1 +R = 89ff866889245e797926509e563b1746920b78c9370a6cdae52663730d131e558e327d1f5fef8faf9e6c802fa29504ed +S = 8dd68e2de2f788e598b3e5a60c18d81849a0cc14b3b0e3c931910639f3125e5d6045f00330b1fa989252a80f95419b04 +Invalid = Y + +Curve = P-384 +X = 499cbdf18ec4e69b88051543c7da80845fa2de8be2b9d9045fee7f104a8b5b7d04e69142de9955c5ab18c5a34ebff075 +Y = 
a29cb8d28836b201a389922b6f8f93870f09c80a00242d00d32656a43ac1440fc55bcb123551a73290f603c3469be9ed +Digest = 5f00b3b48c1ee8287abe6f3fbc3438b91f4268f318ae2aa1e7810369d6716020 +R = 25d4d243da6fd9b439a9242c3656fade7acb7a306e8cf23ea89e3ff4f9330be19c61aaa42d7b426d12c8e0f96b80dae5 +S = e7a99cf4b269bb4a6210d185e9654602523b5cfa1cddc94b1db92018aa557ecb6adda44c816975f5ec1756b6df3c44fd +Invalid = Y + +Curve = P-384 +X = 9a74ea00203c571bd91ae873ce0ed517f8f0a929c1854d68abd3b83a5051c0b686bb37d12958a54940cfa2de23902da7 +Y = 6f20ccf8fa360a9ec03d7bb79ff17ad885f714757ef62995f824908561dc0c3dffc49d873627936a2fff018b82879ced +Digest = 45c3a1b29a18780234f12f5e4b64e7af9de2acf0029ce55b706cc79a7e4df994 +R = acc1fcac98c593fb0a0765fce35a601c2e9570d63ea1e612fff8bc99ac2d4d877750bb44cfb1014e52e00b9235e350af +S = 7f53de3afa4146b1447e829ebac8f5645e948cc99e871c07280cc631613cfdaf52ccaeccbe93588a3fd12170a7ec79fa + +Curve = P-384 +X = e22f221809fb7a054ac799a70b3d24744eb7c5096c8671770399527c88ccf9ddaea0257a0ae9430d927ff5d9f109c533 +Y = af4101d60df9b306ae92da7592f4faf3df422a3e33f1c2ed2973b2b900eefc346b4cf024de650abf537cecd12ac77618 +Digest = ef1057d83a6e6481be7caf2c12c15f085ff971f02f0db8544352558e2b9fd61c +R = c39a8e79f0560b9f26504469a470c7b2230c0d25de07c206e87dfbde9aff0a5d85322f56dfb50d4c1fc67c67d615dad7 +S = 2ad94dd13a39cf4f4cb24c2c81d4c1181652363addd856dc9ba7455458e40ed047cd113129bc87f43949d5a98a0d5205 +Invalid = Y + +Curve = P-384 +X = fa8ebc3682d90ac7356f0b75b9e3376e76518676e0bedd176cfa7fa57fea4b3a399dbb2bf735ec90b9c1705cf9fa6f57 +Y = 18c3fbca0150ec10696b3851f31fb3ba62c0b6be509d249e0d4b374c7a08e49338e0922e2a8a9319999e6569ab8d292e +Digest = 0c7152ec620fe9b783625196b41192dd5d49df184ad26965c970ac5e28bb1c4b +R = fb58ab09b8a7ef7a6ec05b854eae11af9b713f7c7540e25115f609846e636ad4f88dcf4dd61e311273df23ccda474f03 +S = 485be4c21b7c3a9c6b39ffc9f0c39f4050f76d2a6b3fae203d016318c541c1b4ad6cfc0d0950636ff6883895dd49e4e9 + +Curve = P-384 +X = e5f331536a2940cd67234bedf813c12e15aefa9a1a68429f8754bf2769a47c9c2efb5c42135e7b01a110d7302e097eac +Y = 63b2398612c863febd482184e834d3acb51408c49aacbbd35d8719746f37cb13e013c9505ce034cd815aacd10d2f7a0d +Digest = d925955406f6b6dd4df05270a2539a5924830dfbcbf6a5a34f21354db246244b +R = 96c35f22d036785a392dc6abf9b3cfb0ad37b5c59caefcc0b5212e94e86739a2674020ff79258094d90d7d59f09d47a1 +S = 373cbc865384734c56952f7a35a1fdecd88e8b343ee3aa073d30f5f25b73506f1e5f5857f668b0080dec6edeb5e1be96 +Invalid = Y + +Curve = P-384 +X = c53ad865beb1e2b92764065f1a6bb465ee94aacabe43426a93c277d02e00fe36be1c859ba08a031fc518a0d007668979 +Y = 6728d42bae9bc097151748ffa0982964bdd16076fa0e7cc15837c1f773b08d02c3dbc57339091ccc34105b84781150b4 +Digest = 6d5fa5b492406a1e93df6bb6364d7b17a24ef43807a1159acc77486dd7b49b60 +R = d4f0dd94fc3b657dbd234767949207624082ff946de9ce0aeb0d9993b8c7d7935760e1bf9d8b233bc7d6cd34928f5218 +S = 0941df05062aa8849610f4b37d184db77ed1bc19ad2bb42f9a12c123017592bf4086bf424b3caad9a404b260a0f69efb +Invalid = Y + +Curve = P-384 +X = 1f94eb6f439a3806f8054dd79124847d138d14d4f52bac93b042f2ee3cdb7dc9e09925c2a5fee70d4ce08c61e3b19160 +Y = 1c4fd111f6e33303069421deb31e873126be35eeb436fe2034856a3ed1e897f26c846ee3233cd16240989a7990c19d8c +Digest = 8cf5e81c6858b8395421d8c913f1ac887e282b5818eab525fb79feb9bc64bca7eb98f94b9e48b705e6c28311bb0ca672 +R = 3c15c3cedf2a6fbff2f906e661f5932f2542f0ce68e2a8182e5ed3858f33bd3c5666f17ac39e52cb004b80a0d4ba73cd +S = 9de879083cbb0a97973c94f1963d84f581e4c6541b7d000f9850deb25154b23a37dd72267bdd72665cc7027f88164fab +Invalid = Y + +Curve = P-384 +X = 
cb908b1fd516a57b8ee1e14383579b33cb154fece20c5035e2b3765195d1951d75bd78fb23e00fef37d7d064fd9af144 +Y = cd99c46b5857401ddcff2cf7cf822121faf1cbad9a011bed8c551f6f59b2c360f79bfbe32adbcaa09583bdfdf7c374bb +Digest = 965b83f5d34f7443eb88e78fcc23479156c9cb0080dd68334dac0ad33ba8c774100e440063db28b40b51ac37705d4d70 +R = 33f64fb65cd6a8918523f23aea0bbcf56bba1daca7aff817c8791dc92428d605ac629de2e847d43cee55ba9e4a0e83ba +S = 4428bb478a43ac73ecd6de51ddf7c28ff3c2441625a081714337dd44fea8011bae71959a10947b6ea33f77e128d3c6ae + +Curve = P-384 +X = 9b3c48d924194146eca4172b6d7d618423682686f43e1dbc54ed909053d075ca53b68ae12f0f16a1633d5d9cb17011ec +Y = 695039f837b68e59330ee95d11d5315a8fb5602a7b60c15142dbba6e93b5e4aba8ae4469eac39fa6436323eccc60dcb6 +Digest = c68382d0641ffad850c41365a8ec68e3d55acba376d1bb941e7dcdf7b71f37b8288b023b942373a40be1dfaaf4aea633 +R = 202da4e4e9632bcb6bf0f6dafb7e348528d0b469d77e46b9f939e2fa946a608dd1f166bcbcde96cfad551701da69f6c2 +S = db595b49983882c48df8a396884cd98893a469c4d590e56c6a59b6150d9a0acdf142cf92151052644702ed857a5b7981 +Invalid = Y + +Curve = P-384 +X = 5140108b93b52d9ad572d6129ed6564766f8df3755e49fa53eba41a5a0d6c1d24a483c90070583a66e3cfa52b6fb1f31 +Y = ff52498446a40c61e60c97554256472625633eda0c1a8b4061481fecfbe9c4503e99dfc69e86c9e85c8cc53dca6b8dc4 +Digest = 4b945020c329a61221060e924ec682eceb842c09537fe26265ad084753b89f7650cee4e8df30b38126984d80fd25d246 +R = b2726b2ba9da02de35e9953fc283d1e78700860d4c33dce8db04dd41499d904866c1b8debb377f6c0dfcb0704252174f +S = 0775b027068d7ad55121a278a819f52099ace750d5e996eaec9dee7be72758736cf769650148fbd5c411beb9b88f979e +Invalid = Y + +Curve = P-384 +X = 31f4fc2fac3a163a5796f5e414af6f8107ab5e4a98c755d81efa9d5a83c10128c16c863190112fc29d3d5f3057a2edf1 +Y = fe208743f3e96c3a34b5fff78c9716c074a1ce3dc01c3f0e471ddfae91cd88e7dda38dd0e5e1f91b00b8539da3cc10bc +Digest = 2d6affdf541609f649dbe9fd5829059bf42021fcfefee42d8c9cd5c127015c06b4c3c13ef56d08767788955887752e44 +R = 706911812ec9e7370234efd57b2855975eab81e9c2fe783aa8e442dc6e7d681dab2dc0dfc6765f87ab67001108e3facf +S = 42c89efa22d853d32f619c9fe13e9852889ac98a9fed5d4fa47fed238e1cbe70d7970af9f7bdf84e51176af4885f2490 +Invalid = Y + +Curve = P-384 +X = 1f7911dcfe63a6f270cf75b8584d9b1b4a00afc1fa43543c945945b8a821ebeb37fbc705a000f9cc7c35f7d27027b7bb +Y = f11835ec80c4ac06d99247e73bf72522109ac255e6109262de4dfbf9619244f74fb6c9ee57694537d7e79c248db34dc4 +Digest = f4b0a912331e7fc59a7071e5f47c9dafa6dc09b32c5c3d05301b3833bbe0b9168e2b63f12248849572a322b2f5423b8d +R = 3587c9c6885adf3be1086825f9a41ccd2edfa0bd95e7fc4dba5a9710f41d539132de7772f14c18e318f8992b66d2a86c +S = 73a844d729599d4e3e3c1b63e9c4bf5a73d1f69e0160857fe63a56c381c051f5c37ea6b4cc4caacb6ff26ef9699efe30 +Invalid = Y + +Curve = P-384 +X = 2039661db813d494a9ecb2c4e0cdd7b54068aae8a5d0597009f67f4f36f32c8ee939abe03716e94970bba69f595fead6 +Y = e2d5236e7e357744514e66a3fb111073336de929598eb79fb4368c5bf80814e7584a3b94118faac9321df37452a846fc +Digest = cae50a424395e38bde9ba31fa5ea0c107ccceaff06663719162aac2c3e15f2b2cfd376f90d371326e1d29e0392a756ee +R = 164b8ac2b34c4c499b9d6727e130b5ef37c296bd22c306d1396c6aa54ca661f729aa6353b55d7cf1793b80b5a485115f +S = 4e7187f8f735b7272f2c0985315b5602bb9b1a09f32233aa10570c82d1ccedef6e725800336511e47f88ddbbbdc08f54 +Invalid = Y + +Curve = P-384 +X = 46dcf8ee848c6459fa66d1cae91ccd471401a5782cb2d3b9b9264189f0e9ddf7197b05c694931bde3306240cf9d24b7e +Y = 79d9508f82c5ead05c3f9392f3b1458f6d6c02f44420b9021d656e59402e2645bf3ba1a6b244ddb12edbb69516d5873b +Digest = 
039fe89dfc54e7f2162545af700a8c49a1216b08854643656b07d74e7032516fd0c9368c5e5ce54655e4d08baa29b6f0 +R = 5ffba3b5bd7c3a89ec40b47884b0b3464e8abb78608c6d61e1e62c2ca98d44fcdf61825d69dffee8408d0849d0623bac +S = 0d2597b5fc3842ffce1957172253a8c9c0e4dbe770ce54f70f139e0545dc34ec639d609e14175bdb2b812ccfda00c9d4 +Invalid = Y + +Curve = P-384 +X = 097cea75f685cf4d54324ad2124ce3f77b1e490bbaa1ffacde40dd988f7591e1c5d158e6f232500d958762831914af7f +Y = 716d8bc056daf69ca2edd21b89a6ae9923cfcae87bfda5f9a6e514dd4b9d28d164fcc613ca2afb9660adfece59f09b66 +Digest = 02afb35f1df33b3d83df3391ca4184121ca52f520dd12ffc891aee77eab6503f232a5b1231bd997239751f46c4133edb +R = 1c5d4561d2a3af8835839b543098c101c715c545eb7d00300c5cb05bb08dac29e732ffdc31c50915e691999ad505104c +S = c3442f2fb1498fd47c2f959edff37a19783e3ccee80dc6955ca64db087fd188e67358e7b9223535bbb858d21ba6a978c +Invalid = Y + +Curve = P-384 +X = d2e2b3d262bb1105d914c32c007ea23d15a98197f0ed90b46a17f3d403e406a76c8f752be1a8cd01a94fd45157f6511a +Y = e585fba180017b9983b4c853ad3a5dd52e079c5f0ef792d1a0213b6085e390b073de1a4b01749ceab27806e5604980fe +Digest = e66b11b84f87c38526438e5e3c5b4521248c358eaab80e40526906a05fb29d14d4e5686681f03bc3f0025d45dfb83b5f +R = 49c001c47bbcee10c81c0cdfdb84c86e5b388510801e9c9dc7f81bf667e43f74b6a6769c4ac0a38863dc4f21c558f286 +S = 1fb4ff67340cc44f212404ba60f39a2cb8dcd3f354c81b7219289d32e849d4915e9d2f91969ba71e3dd4414f1e8f18f7 +Invalid = Y + +Curve = P-384 +X = cd887c65c01a1f0880bf58611bf360a8435573bc6704bfb249f1192793f6d3283637cd50f3911e5134b0d6130a1db60e +Y = f2b3cbf4fe475fd15a7897561e5c898f10caa6d9d73fef10d4345917b527ce30caeaef138e21ac6d0a49ef2fef14bee6 +Digest = f6325d6bcaaaf1aba1197a290b33974f2fe8af200d5d726e78705904e9894ec31988e35dc76b9976834b7cd1c4c67146 +R = addfa475b998f391144156c418561d323bdfd0c4f416a2f71a946712c349bb79ba1334c3de5b86c2567b8657fe4ca1f1 +S = 1c314b1339f73545ff457323470695e0474c4b6860b35d703784fbf66e9c665de6ca3acb60283df61413e0740906f19e +Invalid = Y + +Curve = P-384 +X = a370cdbef95d1df5bf68ec487122514a107db87df3f8852068fd4694abcadb9b14302c72491a76a64442fc07bd99f02c +Y = d397c25dc1a5781573d039f2520cf329bf65120fdbe964b6b80101160e533d5570e62125b9f3276c49244b8d0f3e44ec +Digest = 709d1bf45b5817f5a67b859651eb47133ebed2622fda09ab66d3467b5e95da50ecc2c74d8f4d289feebec29729a4bfa3 +R = c6c7bb516cc3f37a304328d136b2f44bb89d3dac78f1f5bcd36b412a8b4d879f6cdb75175292c696b58bfa9c91fe6391 +S = 6b711425e1b14f7224cd4b96717a84d65a60ec9951a30152ea1dd3b6ea66a0088d1fd3e9a1ef069804b7d969148c37a0 + +Curve = P-384 +X = d1cf635ca04f09b58879d29012f2025479a002bda590020e6a238bccc764478131cac7e6980c67027d92ece947fea5a6 +Y = 21f7675c2be60c0a5b7d6df2bcc89b56212a2849ec0210c59316200c59864fd86b9a19e1641d206fd8b29af7768b61d3 +Digest = 5d54d236db6ab4691b3d50dc81471c5d388e5735ebdd435e9742a5a8a0ad0e841bab57326c8535a680ada57d2b3a70fa +R = 6101d26e76690634b7294b6b162dcc1a5e6233813ba09edf8567fb57a8f707e024abe0eb3ce948675cd518bb3bfd4383 +S = 4e2a30f71c8f18b74184837f981a90485cd5943c7a184aba9ac787d179f170114a96ddbb8720860a213cc289ae340f1f +Invalid = Y + +Curve = P-384 +X = d15ca4b2d944d5539658a19be8ef85874f0c363b870f1cd1f2dc9cb68b2a43a10d37064697c84543e60982ab62bb32c8 +Y = 062fb7dfc379fc6465302ac5d8d11d3b957b594c9ef445cfe856765dd59e6f10f11809e115ac64969baa23543f2e5661 +Digest = 67cf9e6f9e9558a379ef7361771323a4f3925f2c7a5d94d9156bf2d9d45f9f8fc4d47322da622fbce92fc764a2ccc327 +R = e2cf123ce15ca4edad5f087778d483d9536e4a37d2d55599541c06f878e60354aa31df250b2fc4ed252b80219552c958 +S = 
696707a7e3f9a4b918e7c994e7332103d8e816bbe6d0d1cf72877318e087ed0e230b0d1269902f369acb432b9e97a389 + +Curve = P-384 +X = c83d30de9c4e18167cb41c990781b34b9fceb52793b4627e696796c5803515dbc4d142977d914bc04c153261cc5b537f +Y = 42318e5c15d65c3f545189781619267d899250d80acc611fe7ed0943a0f5bfc9d4328ff7ccf675ae0aac069ccb4b4d6e +Digest = e8d6b550271b486e79f6975cff753d49519ed9393b207af7039b4c070cbc2fe7d49dd1bb87f7021e442fadd80ce8a5b0 +R = b567c37f7c84107ef72639e52065486c2e5bf4125b861d37ea3b44fc0b75bcd96dcea3e4dbb9e8f4f45923240b2b9e44 +S = d06266e0f27cfe4be1c6210734a8fa689a6cd1d63240cb19127961365e35890a5f1b464dcb4305f3e8295c6f842ef344 +Invalid = Y + +Curve = P-384 +X = d4e93c4bafb54c06814011309e9f3d8e68b76a5452e364ef05ccc3b44b271e576c9028106b1584f09271c886d467f41d +Y = db730ccfdeb6644362f4fb510d5254bfe6f23e891e936132f90f1913e93baa8b1f8c0613a0f0c61a760ce659f22babc6 +Digest = d5c82ff11f555ce21c3f20a9ecfa6047cb6895e32fa0fb379f49085a59f61b7c8fa05058ef144cf47db5738fa40f4890cb59695998a2358162bbbf6d7f53517b +R = 8d0fd14a59c24b0c2a34b438e162f1f536fe09a698cacfe0760d026d1593265d02f2668d2a5e49ac0b21e93807aa9c18 +S = 3162ffd2adc9dd5ec1bb1d97d2b0c27b8ae234235ffb374878d0b76382002ea505e885c178d56a2d7809bd1d83117ef1 +Invalid = Y + +Curve = P-384 +X = c665feccf51e6bca31593087df60f65b9fe14a12022814615deb892eedb99d86069a82aa91319310b66588185282dad6 +Y = 1e6e25bb8ae7714415b94f89def0f75dcb81d4af6b78d61f277b74b990c11aff51bd12fc88d691c99f2afde7fbd13e51 +Digest = ea056beb112fa9aad69c8dfe51ea947b772bf1c11287edcede43a98089d21492ed581edcb6d1823e2873aabba213b84291db3bffa6eac3ae43a92fc2da276a24 +R = 0e18c4063137468fe864fdc405ad4e120176eb91b4538b28ce43a22ae1a310cc22a2f7a2b3a0f3d15e0f82038b4a4301 +S = 5a1620e42041ce4357daf824befbb2ed65596bcd8214e88726149b26b1f416b9472a8877413f1c3705fc2edf4731943b + +Curve = P-384 +X = a6bbf85e8068151482ce855ccf0ed22988fcf4b162c4b811cb7243b849299e3390a083147fbd68683203ba33588b13ae +Y = 5c837ec9f2eda225c83ab2d5f10b1aa5bfb56387deebf27ecda779f6254a17968260247c75dd813ea0e1926887d46f86 +Digest = 81b1303e10f25d37877b09f9d82dbd894e40264992d86cc74656ebeef505b46fdf9dec312a7f0a26e3f56a7195d5b01d198c378fff9d049e00cbad9586da20c9 +R = 9c11879e59659848274fc1ef5a6a181af813d23708b09a24dc06c089b93b918828dd938a75a34d5a681b0af362dc19a0 +S = 9c362231962ba7579c4a874e87bdc60dc15cb2e0677149c8ea31162963e05a6614616f67a5269616071cf095be7ff44b +Invalid = Y + +Curve = P-384 +X = 9c1eb5cdb1a873e4c275b7ded8712b9058ee0d9ded06c96a2a8d7c652b82e894e2f918dd8e18138e5c34821744b97952 +Y = dd474c93619f02b5d4fe30ea7805c1a13fb80008a81bb5f3eeb95cd11f38841b8e34d64f2c6cc2d6cc2587365eed6b6e +Digest = c0f9ae90fe8aaf54962e7d47a832e4ca6e60355e4066cd2b08bff78650d4e4a5d1eb1de296f9f0ef92887e09f82e0db4411aa9c3c6b109159bd39feed40419a3 +R = f17b2f2fa3b5c8e9c62a633e5d417139ddf3dafba75b464fa156c99b3948a0aca532c7fd3e14a266eb17e7fa80881da2 +S = 01c246866983fa74d6dff38b1ea091f8afd218b5a42467761b147c19a3bb20cd24be8ed1f95f1e61863a709d2d0148e2 +Invalid = Y + +Curve = P-384 +X = 20622a293edc96d83fee77cf1ee8077c61d6f8ed0073d53cfb5ee9c68e764c553fa4fc35fe42dade3a7307179d6fc9c2 +Y = 710fa24383f78cc4568fe0f4ecbbe6b11f0dce5434f4483712a6d2befae975a2efb554907aa46356f29bf7c6c2707c65 +Digest = 5cb8ed471a4001e280a0927faf25183c857b9b2de21c8566e8a1bf04ee085c36db7fab9d8f627898b3bb23c10225305938b56a732659f2cab3fa857d80dfde19 +R = 45a6cf5cef06256139caa709292d1e0f963d176add188572e9c7be29af21a95853a98e23aef0a0850e58d44d60b6d780 +S = df8d71cd5ab22fc718070078103483e5258734872ab935435f21ea199018e49a69c064a63801beb0759fde6e2c4a85b8 +Invalid = Y + +Curve = P-384 +X = 
83a4fecc0bf0a353b0acf6f54094b822f2b12564e172b296f3461cafa7315d7d31d0089b1b4c18ad3c86bd18f539774a +Y = e4fd57c5b2937e6fba1e7d72fc3f02352bd79c13611931935f4dfd073b9379f862f2277585137e996e212b5b6533dcba +Digest = cd7c623c3c3b52f46be0ebb2b353ff97db3cd7dfc1a059a57668fc50101aeeb37b8aee9ddda8ab611546999a120cc9acb0e2c3df48dee66d5c31a46a7be94bc7 +R = fb02804010a570d702ebfbcf3d6cc9d55ddac2bd4b4de56d325e9790571b1737f91d3fa1d4caeec6eea806195aed3187 +S = 1fd20fe383e907e77639c05594642798619b2742090919bedeefb672c5700881baf0df19b9529d64bc7bb02683226103 + +Curve = P-384 +X = 208a8c5a6b59458160c5b680116c8b23799c54a7ee8954a4869425a717739facfe4fe24540505cdc133fde8c74bfca78 +Y = 22aa7aba797bde1e8389c3c3f8d8d9aa2a914f4d2d7aaf7187ebed9b2761975718ef97660ba0b8a71dee17f2b982e2cf +Digest = 007b907b90fa60835d45d2f0201a4486d9782fea4f0a235d97d4968336c5369c6c2e82bded56288a10fd6741f4c15d1633bc92e0196308d9f0490fc2077d3b6c +R = 0b4e835ed83151d2bde96e201c54544ba5f301aca853957d3c538c9858fcce796b60fc50f5600a48dcdf13e5bc029827 +S = 0270adf02d31d5428d523e13d7d315c1929a1d89bbd0f61eec0b1186abe1c307cbba6b1067a68bc3947e6196d49719a0 +Invalid = Y + +Curve = P-384 +X = 80ae47e99107d6148b1088c6694df5c1273ff336b66e45b68a7c65fed735129dadcaf2b900e9f8ec50eff70a5ba89ea3 +Y = 47450efb5669bfacd7cbff1f801aafa0812ff88a6ae7b5a1f85e88e19129ed995f509fbf8dec15ce42bbbbd33814c09e +Digest = 1cacc8f609080e7b8339529f944850a700977ef9107f40956fb35645e15fdd54ef01755f07a2582d0bf2ca0cb84ee8ab154fe0914dfc9ad7ad5fe54b857d0f4e +R = bae6fba7b1485ecdca48219ead3c39295fa9c196b1f0941445b1ac768e33962f68d37f1f1749eaad7200064aa202fb41 +S = b411a38d02deb42d1015a7837b033c89d2f37d92c70fa8bb1f592223f7750520b950f30277abfb4155a3ab194b3beca0 +Invalid = Y + +Curve = P-384 +X = 45cb6dcca8d2e80ac04536a22f9d68ea2313245550108ddcd32799d154c0a55492e49463e826275bd9bf0d5e380205c1 +Y = 6fd124f5a6c745751ccfb3ba4dd9144ea8fd41a4d9a4b34820434da66aa7385e73ffe71e6c11ed1beb6c7af22ce00edf +Digest = dd7947a5b9a1c988dd7dff537e15335aacafd3e602adc8373765013f338334dd58aed4fb7144de0007c3410d79f5e78bcd4cf0dd63cc33ed3dd564882e299c7b +R = 2c782c4263eeee63657fbf20fa287a1a81fcd14b1d3bae333928ba4fc31abb20edebc130714380608e38ea74309eca9d +S = 716113d95bc9dba532bfb470112b0d43d9cd6560ad15e0de2e514994801ff339bcf19ad4ee2b8af573f57c038fbd70f0 + +Curve = P-384 +X = 36c1459d9e9f7b6c1598778c784cbf94661a2b11370c02ee092f6ea0ca20acf81f1ed5048a28a1466a91689df26bc291 +Y = d1367418c7b216bd32c6dafc8b2be99d02cab68df990758b2ddd543b7eb6ff6e285b649ffe588b1811b549cfb5f0289b +Digest = 242ff2713c03e3d5277652f8e7fb1e5a1f0422b6652e1bdd696e46c03cdd3aaac329b1d88e7aa345ff7224ce6dc6df05c7e9d7dc2665282c817d15a15b8288fd +R = 40c338adeb504193444bdb95336177362031aaadc5b7e151e42030df9dd8687f3cb8fe2292fd4f9206989c089d966dae +S = be4b2ba251094c24de006c89af2b5c77e6937f36d7bb703b4f8edcfe65d45f4b2fd2486222163ae0ed9e215c0a96f488 +Invalid = Y + +Curve = P-384 +X = b5eb6670bb0b0d3aef10e533d3660756b7372a2a081d9d920130034f48202cd43b9e2d1e5893d0cfb322db65ab839716 +Y = e28444770396041b489b302786a57fca9a98f19685cb4b455d219151e64645ad30dd3149ec96f3bc90879834b65e58aa +Digest = 8d2e653807e87962883956ee3705b2167c50370c3af12eb8f6c26f0f15ede56dddc7d0c9642a1c1c2444b06571fa1a4d47e7884acc7ea3884daaa50940f782e2 +R = 0887a13df940907864b425ec0d8f91ac719abcc62b276fa08c5122b38831c8930abd3c8454e98182bb588fc72843717a +S = a380284eacaa36a34e35f04fbf6e28ffb59176f41ea52d9c9bc1362eccd8e0d699c2e08111d93e9dc2785637b1f4f09e +Invalid = Y + +Curve = P-384 +X = 700e8f65e052e918a63a96fa57f4eda849f9f9faca3302d6ead66ebf85838f8145a6d6718a681b7bef73170d7254958f +Y = 
9e9e10357658913007803859165926cd1e5e92c3a644d834098cb1cbfab466349bf4238a5154cf50ed77c77a78263e81 +Digest = cf885fa7a96db595f825a0ccc56b70b60e0e1c30d0a15af636d1f4957328aecb7eeb734d5874bd72ddaf15c357ca36bd42abf387f7b771ea6160e2e23a08652e +R = 59be870e0fd684b000cce95c616d9f34674354e9d20db15d204b8a6285ff55258e4eeb49da1573ef1030cd6b2626dcfb +S = c0bbbf71d87479d82575458be9f4d686921db7ea458d620271f51ec3f4d1afe3bf25ef9c0c400eb7b92cd7058fb17346 +Invalid = Y + +Curve = P-384 +X = a9de6f029445fffcf16349b44095cc83b11e3d0d9f08654b158014803b1cc31b8dfe00b1a8167c6f704d69cdd62c6512 +Y = 27336a503a669ba1d1f3619f51dc8aa2a44b2075c682a36f071be486e7dafba9adfac2ce74be0442b7251e99304ffc05 +Digest = b7e73f38767f253790e7fff019b4e0e61562aeb97b2b749afec2a61c87ab0e15916d4286c0a13989912f6bafdf3efc6f64ddc3b944f9041266e5abd4480c1606 +R = f93a4d2eb94d087f28572847e0099ae2ee944efacdad392ec268c9c1e632e6ccd670c36584e58aba52a4c2b07127d55a +S = 941ee89cea6e7ed20213a95482fae134707ddf4d292ab1952ed5464f1f1138669dedbfc9998b696eaf469be5fb240c80 +Invalid = Y + +Curve = P-384 +X = e63500d6d13069c01fafc4518f1d429661c5bb6ad1ff0383037ca6a469a5c20c453dce03bf6e4164f7e26f849016b3d0 +Y = 83b7b731c2531c3ac61b194cf3db6dc02ccdfa16d9eb49f97bc4ec3fe6c8bd865ea27f1538531ad07dc44fc5107af8e6 +Digest = afc0ed355377d0ab0c4f79d420dcf67ad4920c013d5c8afde2287525da4596672927540418a61568b21ae7799d7659f16b85f611bd6e8d2066a55903da0c48b9 +R = eb78733e73fd64a6a1f23eba5311af23d26816fb8847671e01fdbd8dc7d5fce1a0823b080ee99e8d75edb3f100e16077 +S = bcaedfe599f98b51542c0f94ae1010611c6767ac3abb2bd887399d62fd0f1b3a0e97deb24c95a76de44521bf24c8645e +Invalid = Y + +Curve = P-384 +X = 3ebd869be687f82d844416e6816d698d82e1e22a1f451d50b6c146134deb07f05204c0b04e7dc07ebdcfd916531dc7c3 +Y = 6e4d7bde063edb7254a82b9d9249d2a2b9ad8988c37a84ac9f7c09daed42b1fd28f7cca1ea8b4f91a66e878224800bdc +Digest = 56a61339a35750e95770f28846930e3f594e8d759e07423718734a82b2a80430b0fb3378e40bdcf5c12be135be9a9bec32916b4988a763091a6da7b44631414e +R = 575f87a8a7980555a198cfdec279cbb2f89551b5271d242397c29f6bc4bf413dc30312a7e626ef7fc77a9124a79bf9be +S = f0b7d759246ad36ba8240c537b1eeb5d148c38d324f48028c598eaef6e49d79ff3f6cfe3a32fbbf6f3ed3aaaec31d572 +Invalid = Y + +# The following tests use digests equal to the order and 2^n - 1, where n is +# the number of bits in the order. This is to test the truncated digest not +# being fully reduced. 
+ +Curve = P-256 +X = e57231383637c82c1ac801724cf7e03e67198f467a9beb60ac13cb582d13afa8 +Y = 8f190e090155fcf63810b858bc88e259dc49afef8bdef6fd06d93dddb1991aed +Digest = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 +R = 05cc6037bb021f4910ea2e489fab2bae6bb6a2769a97f42ba5736994102b7f10 +S = 5db54832ceabf8bccdb8be99b1a49cecff8feee045cb697dec43118e2695b1da + +Curve = P-256 +X = 6e0e2897b9a554ee287cdaf43bfbe25ca8404373971575a0e4b61c61aff5a2fe +Y = 23ea7823a411eb1b39f81bbde24c2cd6ac68be2c7eec3a0671c8676131b8905c +Digest = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +R = 16831feeceab2fab1c575e073e944d73ce7e6f3e9b06312088f06159c530ff50 +S = 870cb824692638538b1569c6093fcb693c054e8e3b9a919e3bb26798910f66e9 + +Curve = P-384 +X = f4a961c19f9cc4ebe4f43081110955f3cede085a08c1415d726e80b2eb774028c5fc96f092ba3ea7d1288dd57fe1db08 +Y = 981398eed0895e09b3b582a0616f3024e51cca7b1ecc347dbf0d24a5f6a222b0c31912f8f5e427d4dde5c6c45212bb10 +Digest = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973 +R = 0b77eaff05bbd922dd80525d2ab301cc119318f5a920a12c71c4b5ff5bb77d25a538983df9bdd5984b0d159daf21f1a2 +S = 73af85ad03a34b6b3993082bf719018d25d1555717b2d2f2535d0601af06a71ad020eff8232d065ab9d7fc4cd0c0ee42 + +Curve = P-384 +X = 54dd8d7cbf2ccdf1a42f5bbc615a372803b094f6040e3c7b651a61bc6912432c836cf2410ab7d67f543236751d81066f +Y = 2219d6257b1c80bf327c96786f2b5d0b5a9b9bf7eee9c853bf66a3bf09520494cb1f7823e4c566d79a617b7e201ead96 +Digest = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff +R = 9d923e199d98272e44b8fba382bf3c19660ecb4a9aae3513ff6802a73fef510c15c202807c3f9334b0bce7d6c6a80839 +S = 520784e6290d04d9b61993ee5ebc6fa8ff527fb0777c43cdefc7586701e60edb399005a5648ff852de80208232849fbd + +# The following tests are intended to stress the final comparison in ECDSA. +# ECDSA verification computes some curve point (x, y), picking the fully-reduced +# representive of x mod p, and checking that x mod n is r. (n is the order of +# the group and p defines the underlying prime field.) +# +# This makes the computation sensitive to values near n and p, and which of n or +# p is larger. Additionally, there is an optimization that performs the +# comparison mod p rather than n and compensates for the difference. +# +# These tests were generated by picking a target value of r and x, adjusting +# both until x corresponded to a point on the curve, and then computing the +# public key by solving for P in ECDSA's (x, y) = u1*G + u2*P. The digest is the +# hash of "hello, world" with the suitably-sized SHA-2 hash, so the test vectors +# are suitable for both message- and digest-based APIs. +# +# "x" in the comments refer to the x-coordinate of the computed point, not that +# of the public key. + +# r = 5, x = 5 is valid. +Curve = P-256 +X = 264d796a0dab9b376d34eea6fe297dde1c7b73e53944bc96c8f1e8a6850bb6c9 +Y = cf5308020eed460c649ddae61d4ef8bb79958113f106befaf4f18876d12a5e64 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 0000000000000000000000000000000000000000000000000000000000000005 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e + +# r = 5 + n, x = 5 is invalid. r must already be reduced. 
+Curve = P-256 +X = 264d796a0dab9b376d34eea6fe297dde1c7b73e53944bc96c8f1e8a6850bb6c9 +Y = cf5308020eed460c649ddae61d4ef8bb79958113f106befaf4f18876d12a5e64 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632556 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +Invalid = Y + +# r = n-2, x = n-2 is the largest x without a reduction. +Curve = P-256 +X = 50a50c01132bf79e42b31fb278f7317b29515e9e1c973a41266b69048826fb8e +Y = aac53e7df37b5eb25ce4ddb705fc7135c6b1e00a7f56e30744f62f258afa5537 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e + +# r = n-3, x = n-2 is incorrect. +Curve = P-256 +X = 50a50c01132bf79e42b31fb278f7317b29515e9e1c973a41266b69048826fb8e +Y = aac53e7df37b5eb25ce4ddb705fc7135c6b1e00a7f56e30744f62f258afa5537 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +Invalid = Y + +# r = 3, x = n+3 is the smallest x with a reduction. +Curve = P-256 +X = ce24c99032d52ac6ead23c0ae3ec68ef41e51a281fd457808c83136d7dcce90e +Y = 8f7a154b551e9f39c59279357aa491b2a62bdebc2bb78613883fc72936c057e0 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 0000000000000000000000000000000000000000000000000000000000000003 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e + +# r = 4, x = n+3 is incorrect. +Curve = P-256 +X = ce24c99032d52ac6ead23c0ae3ec68ef41e51a281fd457808c83136d7dcce90e +Y = 8f7a154b551e9f39c59279357aa491b2a62bdebc2bb78613883fc72936c057e0 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 0000000000000000000000000000000000000000000000000000000000000004 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +Invalid = Y + +# r = p-3-n, x = p-3 is the largest valid x. +Curve = P-256 +X = 768a0d300a595005a520130e50927d403395c8e1e40be997b48fc048410f7cdb +Y = 16f217d8e1c02bd887e5de388a17783b182e61b5d534152dc2c4be8d75fdd706 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 000000000000000000000000000000004319055358e8617b0c46353d039cdaab +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e + +# r = p-n+5, x = 5 is incorrect. r is too large to compare r+n with x. +Curve = P-256 +X = 0ec505bc19b14a43e05678cccf07a443d3e871a2e19b68a4da91859a0650f324 +Y = 77300e4f64e9982d94dff5d294428bb37cc9be66117cae9c389d2d495f68b987 +Digest = 09ca7e4eaa6e8ae9c7d261167129184883644d07dfba7cbfbc4c8a2e08360d5b +R = 000000000000000000000000000000004319055358e8617b0c46353d039cdab3 +S = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254e +Invalid = Y + +# r = 2, x = 2 is valid. +Curve = P-384 +X = 016d2db67561bc126ad6c344d6eeb2713a9e2892c649af0f015c6b7617f160c8a3b3a88add669d7155025073c5ac5b4f +Y = 43bf2ed0088af08645c80aa0a24a567a94ba2d794e9689d3ad4b185bc5d2dd008333e2dd2ebb5069a9b32251a3cac71e +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 + +# r = 2 + n, x = 2 is invalid. 
r must already be reduced. +Curve = P-384 +X = 016d2db67561bc126ad6c344d6eeb2713a9e2892c649af0f015c6b7617f160c8a3b3a88add669d7155025073c5ac5b4f +Y = 43bf2ed0088af08645c80aa0a24a567a94ba2d794e9689d3ad4b185bc5d2dd008333e2dd2ebb5069a9b32251a3cac71e +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52975 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 +Invalid = Y + +# r = n-1, x = n-1 is the largest x without a reduction. +Curve = P-384 +X = b5b375264c09acf145ca91d12ab10a096092a41ec43f4d718e129ea1c12b2dea62c7785efc52f46f009fb1dba133e811 +Y = bc0b2af172b4b3068d032a798080e76f4d56f72069519e3c19a43682a41794e52cb3ca139348d6bbc923e6a4f7945cb1 +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52972 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 + +# r = n-2, x = n-1 is incorrect. +Curve = P-384 +X = b5b375264c09acf145ca91d12ab10a096092a41ec43f4d718e129ea1c12b2dea62c7785efc52f46f009fb1dba133e811 +Y = bc0b2af172b4b3068d032a798080e76f4d56f72069519e3c19a43682a41794e52cb3ca139348d6bbc923e6a4f7945cb1 +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52971 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 +Invalid = Y + +# r = 2, x = n+2 is the smallest x with a reduction. +Curve = P-384 +X = 01b54a697305092bac2939fb906d7471b411c4eba8654169166a5da3810e1fc96795df921f7abbf519be4a027435176c +Y = a19012a3518773d508106d4153adee43c3c384fa62ce36a4addea08f593ec9c76b09a6b9c69d29bd7d47eb48e167dd2f +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 + +# r = 3, x = n+2 is incorrect. +Curve = P-384 +X = 01b54a697305092bac2939fb906d7471b411c4eba8654169166a5da3810e1fc96795df921f7abbf519be4a027435176c +Y = a19012a3518773d508106d4153adee43c3c384fa62ce36a4addea08f593ec9c76b09a6b9c69d29bd7d47eb48e167dd2f +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003 +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 +Invalid = Y + +# r = p-1-n, x = p-1 is the largest valid x. +Curve = P-384 +X = c4fd8e68006b83f7b7b20b731ae405813aa05f6e57374589b36ae1cecd1d49cae1418c22f398188bcf4ef02e89fe7394 +Y = dd1164b3707f59e05129fa228b8448031db159985f035d93470dc42b3ab4129f0760c46cf201d42e73a7e33ba7402ea6 +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68b +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 + +# r = p-n+2, x = 2 is incorrect. r is too large to compare r+n with x. 
+Curve = P-384 +X = 4e5e4f1a6e97059a6cf2f4e8129e5c7c64cb84f9994a41ff5bf30b29c1bf5ba6898627c91a23c73e05cd1a43c8f908c0 +Y = 06a0aed7f1e63a728f87dbd5360a67571a076ab0b4cde81b10d499959814ddb3a8c7854b0bbfa87cc272f90bca2a2254 +Digest = 1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f00418a70cdb7e +R = 000000000000000000000000000000000000000000000000389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68e +S = ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52970 +Invalid = Y diff --git a/ring-0.17.14/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha256-armv4.pl new file mode 100644 index 0000000000..5bcdb3f26e --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha256-armv4.pl @@ -0,0 +1,625 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# SHA256 block procedure for ARMv4. May 2007. + +# Performance is ~2x better than gcc 3.4 generated code and in "abso- +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +# byte [on single-issue Xscale PXA250 core]. + +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
+ +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$ctx="r0"; $t0="r0"; +$inp="r1"; $t4="r1"; +$len="r2"; $t1="r2"; +$T1="r3"; $t3="r3"; +$A="r4"; +$B="r5"; +$C="r6"; +$D="r7"; +$E="r8"; +$F="r9"; +$G="r10"; +$H="r11"; +@V=($A,$B,$C,$D,$E,$F,$G,$H); +$t2="r12"; +$Ktbl="r14"; + +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); +#if __ARM_ARCH>=7 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ + rev $t1,$t1 +# endif +#else + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + ldrb $t2,[$inp,#2] + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +#endif +___ +$code.=<<___; + ldr $t2,[$Ktbl],#4 @ *K256++ + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] + eor $t1,$f,$g + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) + and $t1,$t1,$e + add $h,$h,$t2 @ h+=K256[i] + eor $t1,$t1,$g @ Ch(e,f,g) + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? +#endif +#if $i<15 +# if __ARM_ARCH>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) +___ + ($t2,$t3)=($t3,$t2); +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] +___ + &BODY_00_15(@_); +} + +$code=<<___; +#ifdef __KERNEL__ +# define __ARM_ARCH __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. 
(See unsha256.) +.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +.align 5 + +.global sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +sha256_block_data_order_nohw: + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} + adr $Ktbl,K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 +___ +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +#if __ARM_ARCH>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr $t1,[$t3,#4] + ldr $t2,[$t3,#8] + add $A,$A,$t0 + ldr $t0,[$t3,#12] + add $B,$B,$t1 + ldr $t1,[$t3,#16] + add $C,$C,$t2 + ldr $t2,[$t3,#20] + add $D,$D,$t0 + ldr $t0,[$t3,#24] + add $E,$E,$t1 + ldr $t1,[$t3,#28] + add $F,$F,$t2 + ldr $inp,[sp,#17*4] @ pull inp + ldr $t2,[sp,#18*4] @ pull inp+len + add $G,$G,$t0 + add $H,$H,$t1 + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} + cmp $inp,$t2 + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#`16+3`*4 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + 
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. 
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.LK256_shortcut_neon: +@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. +#if defined(__thumb2__) +.word K256-(.LK256_add_neon+4) +#else +.word K256-(.LK256_add_neon+8) +#endif + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 5 +.skip 16 +sha256_block_data_order_neon: + stmdb sp!,{r4-r12,lr} + + sub $H,sp,#16*4+16 + + @ K256 is just at the boundary of being easily referenced by an ADR from + @ this function. In Arm mode, when building with __ARM_ARCH=6, it does + @ not fit. By moving code around, we could make it fit, but this is too + @ fragile. For simplicity, just load the offset from + @ .LK256_shortcut_neon. + @ + @ TODO(davidben): adrl would avoid a load, but clang-assembler does not + @ support it. We might be able to emulate it with a macro, but Android's + @ did not work when I tried it. + @ https://android.googlesource.com/platform/ndk/+/refs/heads/main/docs/ClangMigration.md#arm + ldr $Ktbl,.LK256_shortcut_neon +.LK256_add_neon: + add $Ktbl,pc,$Ktbl + + bic $H,$H,#15 @ align for 128-bit stores + mov $t2,sp + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! + vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! + + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + it eq + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! 
+ it ne + strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + ittte ne + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} + +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " +.align 2 +___ + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv4.pl new file mode 100644 index 0000000000..b1fa016181 --- /dev/null +++ b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv4.pl @@ -0,0 +1,649 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== + +# SHA512 block procedure for ARMv4. September 2007. + +# This code is ~4.5 (four and a half) times faster than code generated +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. On Cortex A8 it was measured to process +# one byte in 23.3 cycles or ~60% faster than integer-only code. + +# August 2012. +# +# Improve NEON performance by 12% on Snapdragon S4. In absolute +# terms it's 22.6 cycles per byte, which is disappointing result. +# Technical writers asserted that 3-way S4 pipeline can sustain +# multiple NEON instructions per cycle, but dual NEON issue could +# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. 
On side note Cortex-A15 processes one byte in +# 16 cycles. + +# Byte order [in]dependence. ========================================= +# +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. +$hi="HI"; +$lo="LO"; +# ==================================================================== + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$ctx="r0"; # parameter block +$inp="r1"; +$len="r2"; + +$Tlo="r3"; +$Thi="r4"; +$Alo="r5"; +$Ahi="r6"; +$Elo="r7"; +$Ehi="r8"; +$t0="r9"; +$t1="r10"; +$t2="r11"; +$t3="r12"; +############ r13 is stack pointer +$Ktbl="r14"; +############ r15 is program counter + +$Aoff=8*0; +$Boff=8*1; +$Coff=8*2; +$Doff=8*3; +$Eoff=8*4; +$Foff=8*5; +$Goff=8*6; +$Hoff=8*7; +$Xoff=8*8; + +sub BODY_00_15() { +my $magic = shift; +$code.=<<___; + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] + mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] + eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo + eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi + eor $t0,$t0,$Elo,lsr#18 + eor $t1,$t1,$Ehi,lsr#18 + eor $t0,$t0,$Ehi,lsl#14 + eor $t1,$t1,$Elo,lsl#14 + eor $t0,$t0,$Ehi,lsr#9 + eor $t1,$t1,$Elo,lsr#9 + eor $t0,$t0,$Elo,lsl#23 + eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) + ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 + ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h + ldr $t3,[sp,#$Goff+4] @ g.hi + + eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] + eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] + and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] + and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] + eor $t0,$t0,$t2 + ldr $t2,[$Ktbl,#$lo] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) + ldr $t3,[$Ktbl,#$hi] @ K[i].hi + + adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo + adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi + adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff + adc $Thi,$Thi,$t3 @ T += K[i] + adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo + adc $Ehi,$Ehi,$Thi @ d += T + teq $t0,#$magic + + ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq $Ktbl,$Ktbl,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov $t0,$Alo,lsr#28 + mov $t1,$Ahi,lsr#28 + eor $t0,$t0,$Ahi,lsl#4 + eor $t1,$t1,$Alo,lsl#4 + eor $t0,$t0,$Ahi,lsr#2 + eor $t1,$t1,$Alo,lsr#2 + eor $t0,$t0,$Alo,lsl#30 + eor $t1,$t1,$Ahi,lsl#30 + eor $t0,$t0,$Ahi,lsr#7 + eor $t1,$t1,$Alo,lsr#7 + eor $t0,$t0,$Alo,lsl#25 + eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) + adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 + adc $Thi,$Thi,$t1 @ T += Sigma0(a) + + ldr 
$t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 + ldr $t2,[sp,#$Coff+4] @ c.hi + and $Alo,$Alo,$t3 + and $t3,$Ahi,$t1 + orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo + and $Ahi,$Ahi,$t2 + adds $Alo,$Alo,$Tlo + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 + add $Ktbl,$Ktbl,#8 +___ +} +$code=<<___; +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 + +.global sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +sha512_block_data_order_nohw: + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4-r12,lr} + adr $Ktbl,K512 + sub sp,sp,#9*8 + + ldr $Elo,[$ctx,#$Eoff+$lo] + ldr $Ehi,[$ctx,#$Eoff+$hi] + ldr $t0, 
[$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] +.Loop: + str $t0, [sp,#$Goff+0] + str $t1, [sp,#$Goff+4] + str $t2, [sp,#$Hoff+0] + str $t3, [sp,#$Hoff+4] + ldr $Alo,[$ctx,#$Aoff+$lo] + ldr $Ahi,[$ctx,#$Aoff+$hi] + ldr $Tlo,[$ctx,#$Boff+$lo] + ldr $Thi,[$ctx,#$Boff+$hi] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + str $Tlo,[sp,#$Boff+0] + str $Thi,[sp,#$Boff+4] + str $t0, [sp,#$Coff+0] + str $t1, [sp,#$Coff+4] + str $t2, [sp,#$Doff+0] + str $t3, [sp,#$Doff+4] + ldr $Tlo,[$ctx,#$Foff+$lo] + ldr $Thi,[$ctx,#$Foff+$hi] + str $Tlo,[sp,#$Foff+0] + str $Thi,[sp,#$Foff+4] + +.L00_15: +#if __ARM_ARCH<7 + ldrb $Tlo,[$inp,#7] + ldrb $t0, [$inp,#6] + ldrb $t1, [$inp,#5] + ldrb $t2, [$inp,#4] + ldrb $Thi,[$inp,#3] + ldrb $t3, [$inp,#2] + orr $Tlo,$Tlo,$t0,lsl#8 + ldrb $t0, [$inp,#1] + orr $Tlo,$Tlo,$t1,lsl#16 + ldrb $t1, [$inp],#8 + orr $Tlo,$Tlo,$t2,lsl#24 + orr $Thi,$Thi,$t3,lsl#8 + orr $Thi,$Thi,$t0,lsl#16 + orr $Thi,$Thi,$t1,lsl#24 +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif +___ + &BODY_00_15(0x94); +$code.=<<___; + tst $Ktbl,#1 + beq .L00_15 + ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] + bic $Ktbl,$Ktbl,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] + mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] + eor $Tlo,$Tlo,$t1,lsl#31 + eor $Thi,$Thi,$t0,lsl#31 + eor $Tlo,$Tlo,$t0,lsr#8 + eor $Thi,$Thi,$t1,lsr#8 + eor $Tlo,$Tlo,$t1,lsl#24 + eor $Thi,$Thi,$t0,lsl#24 + eor $Tlo,$Tlo,$t0,lsr#7 + eor $Thi,$Thi,$t1,lsr#7 + eor $Tlo,$Tlo,$t1,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov $t0,$t2,lsr#19 + mov $t1,$t3,lsr#19 + eor $t0,$t0,$t3,lsl#13 + eor $t1,$t1,$t2,lsl#13 + eor $t0,$t0,$t3,lsr#29 + eor $t1,$t1,$t2,lsr#29 + eor $t0,$t0,$t2,lsl#3 + eor $t1,$t1,$t3,lsl#3 + eor $t0,$t0,$t2,lsr#6 + eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] + eor $t0,$t0,$t3,lsl#26 + + ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] + adc $Thi,$Thi,$t1 + + ldr $t1,[sp,#`$Xoff+8*16`+4] + adds $Tlo,$Tlo,$t2 + adc $Thi,$Thi,$t3 + adds $Tlo,$Tlo,$t0 + adc $Thi,$Thi,$t1 +___ + &BODY_00_15(0x17); +$code.=<<___; +#if __ARM_ARCH>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] + beq .L16_79 + bic $Ktbl,$Ktbl,#1 + + ldr $Tlo,[sp,#$Boff+0] + ldr $Thi,[sp,#$Boff+4] + ldr $t0, [$ctx,#$Aoff+$lo] + ldr $t1, [$ctx,#$Aoff+$hi] + ldr $t2, [$ctx,#$Boff+$lo] + ldr $t3, [$ctx,#$Boff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Boff+$hi] + + ldr $Alo,[sp,#$Coff+0] + ldr $Ahi,[sp,#$Coff+4] + ldr $Tlo,[sp,#$Doff+0] + ldr $Thi,[sp,#$Doff+4] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Doff+$hi] + + ldr $Tlo,[sp,#$Foff+0] + 
ldr $Thi,[sp,#$Foff+4] + ldr $t0, [$ctx,#$Eoff+$lo] + ldr $t1, [$ctx,#$Eoff+$hi] + ldr $t2, [$ctx,#$Foff+$lo] + ldr $t3, [$ctx,#$Foff+$hi] + adds $Elo,$Elo,$t0 + str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 + str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Foff+$hi] + + ldr $Alo,[sp,#$Goff+0] + ldr $Ahi,[sp,#$Goff+4] + ldr $Tlo,[sp,#$Hoff+0] + ldr $Thi,[sp,#$Hoff+4] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Hoff+$hi] + + add sp,sp,#640 + sub $Ktbl,$Ktbl,#640 + + teq $inp,$len + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] +#if $i>0 + vadd.i64 $a,$Maj @ h+=Maj from the past +#endif + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! 
@ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vmov $Ch,$e + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + veor $t1,$t0 + vbsl $Ch,$f,$g @ Ch(e,f,g) + vshr.u64 $t0,$a,#@Sigma0[0] + veor $t2,$t1 @ Sigma1(e) + vadd.i64 $T1,$Ch,$h + vshr.u64 $t1,$a,#@Sigma0[1] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vadd.i64 $T1,$t2 + vshr.u64 $t2,$a,#@Sigma0[2] + vadd.i64 $K,@X[$i%16] + vsli.64 $t1,$a,#`64-@Sigma0[1]` + veor $Maj,$a,$b + vsli.64 $t2,$a,#`64-@Sigma0[2]` + veor $h,$t0,$t1 + vadd.i64 $T1,$K + vbsl $Maj,$c,$b @ Maj(a,b,c) + veor $h,$t2 @ Sigma0(a) + vadd.i64 $d,$T1 + vadd.i64 $Maj,$T1 + @ vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vadd.i64 @_[0],d30 @ h+=Maj from the past + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: + dmb @ errata #451034 on early Cortex A8 + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + adr $Ktbl,K512 + VFP_ABI_PUSH + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vadd.i64 $A,d30 @ h+=Maj from the past + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +___ +} +$code.=<<___; +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv8.pl new file mode 100644 index 0000000000..a3ed706774 --- 
/dev/null +++ b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-armv8.pl @@ -0,0 +1,575 @@ +#! /usr/bin/env perl +# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# Kryo 1.92 17.4 (+30%) 11.2 (+8%) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significantly faster +# and the gap is only 40-90%. 
+ +my ($flavour, $output) = @ARGV; + +if ($output =~ /sha512-armv8/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +$func="sha${BITS}_block_data_order_nohw"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __AARCH64EB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. 
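+
+# The disjoint sequence below leans on the fact that a rotation is a bit
+# permutation and therefore distributes over XOR: for the SHA-512 constants,
+# ror(e,18)^ror(e,41) == ror(e ^ ror(e,41-18), 18), which is why Sigma1(e)
+# can be assembled from "eor T0,e,e,ror#23" followed by "eor t0,t0,T0,ror#18"
+# on top of the initial "ror t0,e,#14". The helper below is a plain-Perl
+# restatement of that identity, shown for the 64-bit case only (the 32-bit
+# SHA-256 flavour works the same way); it is never called by this generator.
+sub ref_sigma1_identity {
+    my ($e) = @_;
+    my $ror = sub { my ($x,$n) = @_; return (($x >> $n) | ($x << (64 - $n))) & 0xffffffffffffffff; };
+    my $direct    = $ror->($e,14) ^ $ror->($e,18) ^ $ror->($e,41);
+    my $scheduled = $ror->($e,14) ^ $ror->($e ^ $ror->($e,41-18), 18);
+    return $direct == $scheduled;   # holds for every 64-bit $e
+}
+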
+$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#ifndef __KERNEL__ +#endif + +.text + +.globl $func +.type $func,%function +.align 6 +$func: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adrp $Ktbl,:pg_hi21:.LK$BITS + add $Ktbl,$Ktbl,:lo12:.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size $func,.-$func + +.section .rodata +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adrp $Ktbl,:pg_hi21:.LK256 + add $Ktbl,$Ktbl,:lo12:.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +#endif +___ +} + +if ($SZ==8) { +my $Ktbl="x3"; + +my @H = map("v$_.16b",(0..4)); +my ($fg,$de,$m9_10)=map("v$_.16b",(5..7)); +my @MSG=map("v$_.16b",(16..23)); +my ($W0,$W1)=("v24.2d","v25.2d"); +my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29)); + +$code.=<<___; +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw +.type sha512_block_data_order_hw,%function +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input + ld1 {@MSG[4]-@MSG[7]},[$inp],#64 + + ld1.64 {@H[0]-@H[3]},[$ctx] // load context + adrp $Ktbl,:pg_hi21:.LK512 + add $Ktbl,$Ktbl,:lo12:.LK512 + + rev64 @MSG[0],@MSG[0] + rev64 @MSG[1],@MSG[1] + rev64 @MSG[2],@MSG[2] + rev64 @MSG[3],@MSG[3] + rev64 @MSG[4],@MSG[4] + rev64 @MSG[5],@MSG[5] + rev64 @MSG[6],@MSG[6] + rev64 @MSG[7],@MSG[7] + b .Loop_hw + +.align 4 +.Loop_hw: + ld1.64 {$W0},[$Ktbl],#16 + subs $num,$num,#1 + sub x4,$inp,#128 + orr $AB,@H[0],@H[0] // offload + orr $CD,@H[1],@H[1] + orr $EF,@H[2],@H[2] + orr $GH,@H[3],@H[3] + csel $inp,$inp,x4,ne // conditional rewind +___ +for($i=0;$i<32;$i++) { +$code.=<<___; + add.i64 $W0,$W0,@MSG[0] + ld1.64 {$W1},[$Ktbl],#16 + ext $W0,$W0,$W0,#8 + ext $fg,@H[2],@H[3],#8 + ext $de,@H[1],@H[2],#8 + add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" + sha512su0 @MSG[0],@MSG[1] + ext $m9_10,@MSG[4],@MSG[5],#8 + sha512h @H[3],$fg,$de + sha512su1 @MSG[0],@MSG[7],$m9_10 + add.i64 @H[4],@H[1],@H[3] // "D + T1" + sha512h2 @H[3],$H[1],@H[0] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); + @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); +} +for(;$i<40;$i++) { +$code.=<<___ if ($i<39); + ld1.64 {$W1},[$Ktbl],#16 +___ +$code.=<<___ if ($i==39); + sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind +___ +$code.=<<___; + add.i64 $W0,$W0,@MSG[0] + ld1 {@MSG[0]},[$inp],#16 // load next input + ext $W0,$W0,$W0,#8 + ext $fg,@H[2],@H[3],#8 + ext $de,@H[1],@H[2],#8 + add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" + sha512h @H[3],$fg,$de + rev64 @MSG[0],@MSG[0] + add.i64 @H[4],@H[1],@H[3] // "D + T1" + sha512h2 @H[3],$H[1],@H[0] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); + @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); +} +$code.=<<___; + add.i64 @H[0],@H[0],$AB // accumulate + add.i64 @H[1],@H[1],$CD + add.i64 @H[2],@H[2],$EF + add.i64 @H[3],@H[3],$GH + + cbnz $num,.Loop_hw + + st1.64 {@H[0]-@H[3]},[$ctx] // store context + + ldr x29,[sp],#16 + ret +.size sha512_block_data_order_hw,.-sha512_block_data_order_hw +#endif +___ +} + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +{ my %opcode = ( + "sha512h" => 0xce608000, "sha512h2" => 0xce608400, + "sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 ); + + sub unsha512 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-x86_64.pl new file mode 100644 index 0000000000..76a411043e --- /dev/null +++ 
b/ring-0.17.14/crypto/fipsmodule/sha/asm/sha512-x86_64.pl @@ -0,0 +1,1655 @@ +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. Rights for redistribution and usage in source and binary +# forms are granted according to the License. +# ==================================================================== +# +# sha256/512_block procedure for x86_64. +# +# 40% improvement over compiler-generated code on Opteron. On EM64T +# sha256 was observed to run >80% faster and sha512 - >40%. No magical +# tricks, just straight implementation... I really wonder why gcc +# [being armed with inline assembler] fails to generate as fast code. +# The only thing which is cool about this module is that it's very +# same instruction sequence used for both SHA-256 and SHA-512. In +# former case the instructions operate on 32-bit operands, while in +# latter - on 64-bit ones. All I had to do is to get one flavor right, +# the other one passed the test right away:-) +# +# sha256_block runs in ~1005 cycles on Opteron, which gives you +# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock +# frequency in GHz. sha512_block runs in ~1275 cycles, which results +# in 128*1000/1275=100MBps per GHz. Is there room for improvement? +# Well, if you compare it to IA-64 implementation, which maintains +# X[16] in register bank[!], tends to 4 instructions per CPU clock +# cycle and runs in 1003 cycles, 1275 is very good result for 3-way +# issue Opteron pipeline and X[16] maintained in memory. So that *if* +# there is a way to improve it, *then* the only way would be to try to +# offload X[16] updates to SSE unit, but that would require "deeper" +# loop unroll, which in turn would naturally cause size blow-up, not +# to mention increased complexity! And once again, only *if* it's +# actually possible to noticeably improve overall ILP, instruction +# level parallelism, on a given CPU implementation in this case. +# +# Special note on Intel EM64T. While Opteron CPU exhibits perfect +# performance ratio of 1.5 between 64- and 32-bit flavors [see above], +# [currently available] EM64T CPUs apparently are far from it. On the +# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit +# sha256_block:-( This is presumably because 64-bit shifts/rotates +# apparently are not atomic instructions, but implemented in microcode. +# +# May 2012. +# +# Optimization including one of Pavel Semjanov's ideas, alternative +# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and +# unfortunately -2% SHA512 on P4 [which nobody should care about +# that much]. +# +# June 2012. +# +# Add SIMD code paths, see below for improvement coefficients. 
SSSE3 +# code path was not attempted for SHA512, because improvement is not +# estimated to be high enough, noticeably less than 9%, to justify +# the effort, not on pre-AVX processors. [Obviously with exclusion +# for VIA Nano, but it has SHA512 instruction that is faster and +# should be used instead.] For reference, corresponding estimated +# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that +# higher coefficients are observed on VIA Nano and Bulldozer has more +# to do with specifics of their architecture [which is topic for +# separate discussion]. +# +# November 2012. +# +# Add AVX2 code path. Two consecutive input blocks are loaded to +# 256-bit %ymm registers, with data from first block to least +# significant 128-bit halves and data from second to most significant. +# The data is then processed with same SIMD instruction sequence as +# for AVX, but with %ymm as operands. Side effect is increased stack +# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB +# code size increase. +# +# March 2014. +# +# Add support for Intel SHA Extensions. + +###################################################################### +# Current performance in cycles per processed byte (less is better): +# +# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) +# +# AMD K8 14.9 - - 9.57 - +# P4 17.3 - - 30.8 - +# Core 2 15.6 13.8(+13%) - 9.97 - +# Westmere 14.8 12.3(+19%) - 9.58 - +# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) +# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) +# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) +# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) +# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) +# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) +# VIA Nano 23.0 16.5(+39%) - 14.7 - +# Atom 23.0 18.9(+22%) - 14.7 - +# Silvermont 27.4 20.6(+33%) - 17.5 - +# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) +# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - +# +# (*) whichever best applicable, including SHAEXT; +# (**) switch from ror to shrd stands for fair share of improvement; +# (***) execution time is fully determined by remaining integer-only +# part, body_00_15; reducing the amount of SIMD instructions +# below certain limit makes no difference/sense; to conserve +# space SHA256 XOP code path is therefore omitted; +# +# Modified from upstream OpenSSL to remove the XOP code. 
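+
+# A note on the "alternative Maj" mentioned above: the integer rounds below
+# compute h += Maj(a,b,c) as b ^ ((a^b) & (b^c)), i.e. Ch(a^b,c,b), so the
+# a^b value produced in one round can be reused as the b^c value of the next.
+# The plain-Perl helper below merely restates that identity; it is
+# illustrative only and is never called by this generator.
+sub ref_alternative_maj {
+    my ($a, $b, $c) = @_;
+    my $classic     = ($a & $b) ^ ($a & $c) ^ ($b & $c);
+    my $alternative = $b ^ (($a ^ $b) & ($b ^ $c));
+    return $classic == $alternative;    # holds for all inputs
+}
+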
+ +my ($flavour, $output) = @ARGV; + +if ($output =~ /sha512-x86_64/) { + $func="sha512_block_data_order"; + $TABLE="K512"; + $SZ=8; + @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", + "%r8", "%r9", "%r10","%r11"); + ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; +} else { + $func="sha256_block_data_order"; + $TABLE="K256"; + $SZ=4; + @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); + ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; +} + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +# In upstream, this is controlled by shelling out to the compiler to check +# versions, but BoringSSL is intended to be used with pre-generated perlasm +# output, so this isn't useful anyway. +# +# This file also has an AVX2 implementation, controlled by setting $avx to 2. +# For now, we intentionally disable it. While it gives a 13-16% perf boost, the +# CFI annotations are wrong. It allocates stack in a loop and should be +# rewritten to avoid this. +$avx = 1; +$shaext = 1; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$_rsp="`16*$SZ+3*8`(%rsp)"; +$framesz="16*$SZ+4*8"; + + +sub ROUND_00_15() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); + +$code.=<<___; + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 + mov $f,$a2 + + xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $g,$a2 # f^g + + mov $T1,`$SZ*($i&0xf)`(%rsp) + xor $a,$a1 + and $e,$a2 # (f^g)&e + + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g + + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 + add $a2,$T1 # T1+=Ch(e,f,g) + + mov $a,$a2 + add ($Tbl),$T1 # T1+=K[round] + xor $a,$a1 + + xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) + mov $b,$h + + and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + + xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + add $T1,$d # d+=T1 + add $T1,$h # h+=T1 + + lea $STRIDE($Tbl),$Tbl # round++ +___ +$code.=<<___ if ($i<15); + add $a1,$h # h+=Sigma0(a) +___ + ($a2,$a3) = ($a3,$a2); +} + +sub ROUND_16_XX() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 + + mov $a0,$T1 + ror \$`$sigma0[1]-$sigma0[0]`,$a0 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + + xor $T1,$a0 + shr \$$sigma0[2],$T1 + ror \$$sigma0[0],$a0 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + + ror \$$sigma1[0],$a2 + xor $a0,$T1 # sigma0(X[(i+1)&0xf]) + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) + add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + + add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a2,$T1 + mov $a,$a1 +___ + &ROUND_00_15(@_); +} + +$code=<<___; +.text + +.globl ${func}_nohw +.type ${func}_nohw,\@function,3 +.align 16 +${func}_nohw: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register 
%rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 +.Lprologue: + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop + +.align 16 +.Lloop: + mov $B,$a3 + lea $TABLE(%rip),$Tbl + xor $C,$a3 # magic +___ + for($i=0;$i<16;$i++) { + $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; + $code.=" bswap $T1\n"; + &ROUND_00_15($i,@ROT); + unshift(@ROT,pop(@ROT)); + } +$code.=<<___; + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: +___ + for(;$i<32;$i++) { + &ROUND_16_XX($i,@ROT); + unshift(@ROT,pop(@ROT)); + } + +$code.=<<___; + cmpb \$0,`$SZ-1`($Tbl) + jnz .Lrounds_16_xx + + mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) + lea 16*$SZ($inp),$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop + + mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size ${func}_nohw,.-${func}_nohw +___ + +if ($SZ==4) { +$code.=<<___; +.section .rodata +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " +.text +___ +} else { +$code.=<<___; +.section .rodata +.align 64 +.type $TABLE,\@object +$TABLE: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 
0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " +.text +___ +} + +###################################################################### +# SIMD code paths +# +if ($SZ==4 && $shaext) {{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. +# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.globl sha256_block_data_order_hw +.type sha256_block_data_order_hw,\@function,3 +.align 64 +sha256_block_data_order_hw: +.cfi_startproc + _CET_ENDBR +___ +$code.=<<___ if ($win64); + lea `-8-5*16`(%rsp),%rsp + movaps %xmm6,-8-5*16(%rax) + movaps %xmm7,-8-4*16(%rax) + movaps %xmm8,-8-3*16(%rax) + movaps %xmm9,-8-2*16(%rax) + movaps %xmm10,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x200-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*32-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -8-5*16(%rax),%xmm6 + movaps -8-4*16(%rax),%xmm7 + movaps -8-3*16(%rax),%xmm8 + movaps -8-2*16(%rax),%xmm9 + movaps -8-1*16(%rax),%xmm10 + mov %rax,%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.cfi_endproc +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +if ($SZ==4) { # SHA256 only +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.globl ${func}_ssse3 +.type ${func}_ssse3,\@function,3 +.align 64 +${func}_ssse3: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$`$framesz+$win64*16*4`,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___; +.Lprologue_ssse3: + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x20($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x40($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x60($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*2*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*2*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*2*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + + add $SZ*0($ctx),$A + lea 16*$SZ($inp),$inp + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___; + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + ret +.cfi_endproc +.size ${func}_ssse3,.-${func}_ssse3 +___ +} + +if ($avx) {{ +###################################################################### +# AVX+shrd code path +# +local *ror = sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.globl ${func}_avx +.type ${func}_avx,\@function,3 +.align 64 +${func}_avx: +.cfi_startproc + _CET_ENDBR + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___ if ($win64 && 
$SZ>4); + movaps %xmm10,16*$SZ+96(%rsp) + movaps %xmm11,16*$SZ+112(%rsp) +___ +$code.=<<___; +.Lprologue_avx: + + vzeroupper + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + if ($SZ==4) { # SHA256 + my @X = map("%xmm$_",(0..3)); + my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00($inp),@X[0] + vmovdqu 0x10($inp),@X[1] + vmovdqu 0x20($inp),@X[2] + vmovdqu 0x30($inp),@X[3] + vpshufb $t3,@X[0],@X[0] + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + mov $A,$a1 + vmovdqa $t1,0x10(%rsp) + mov $B,$a3 + vmovdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + sub \$`-16*2*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] + '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] + '&vpsrld ($t2,$t0,$sigma0[0]);', + '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] + '&vpsrld ($t3,$t0,$sigma0[2])', + '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', + '&vpxor ($t0,$t3,$t2)', + '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t1)', + '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t2)', + '&vpsrld ($t2,$t3,$sigma1[2]);', + '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) + '&vpsrlq ($t3,$t3,$sigma1[0]);', + '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', + '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) + '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) + '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&vpsrld ($t2,$t3,$sigma1[2])', + '&vpsrlq ($t3,$t3,$sigma1[0])', + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', + '&vpshufb ($t2,$t2,$t5)', + '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub AVX_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &AVX_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lavx_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + + } else { # SHA512 + my @X = map("%xmm$_",(0..7)); + my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); + +$code.=<<___; + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00($inp),@X[0] + lea $TABLE+0x80(%rip),$Tbl # size optimization + vmovdqu 0x10($inp),@X[1] + vmovdqu 0x20($inp),@X[2] + vpshufb $t3,@X[0],@X[0] + vmovdqu 0x30($inp),@X[3] + vpshufb $t3,@X[1],@X[1] + vmovdqu 0x40($inp),@X[4] + vpshufb $t3,@X[2],@X[2] + vmovdqu 0x50($inp),@X[5] + vpshufb 
$t3,@X[3],@X[3] + vmovdqu 0x60($inp),@X[6] + vpshufb $t3,@X[4],@X[4] + vmovdqu 0x70($inp),@X[7] + vpshufb $t3,@X[5],@X[5] + vpaddq -0x80($Tbl),@X[0],$t0 + vpshufb $t3,@X[6],@X[6] + vpaddq -0x60($Tbl),@X[1],$t1 + vpshufb $t3,@X[7],@X[7] + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + vpaddq 0x00($Tbl),@X[4],$t0 + vmovdqa $t1,0x10(%rsp) + vpaddq 0x20($Tbl),@X[5],$t1 + vmovdqa $t2,0x20(%rsp) + vpaddq 0x40($Tbl),@X[6],$t2 + vmovdqa $t3,0x30(%rsp) + vpaddq 0x60($Tbl),@X[7],$t3 + vmovdqa $t0,0x40(%rsp) + mov $A,$a1 + vmovdqa $t1,0x50(%rsp) + mov $B,$a3 + vmovdqa $t2,0x60(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x70(%rsp) + mov $E,$a0 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + add \$`16*2*$SZ`,$Tbl +___ +sub Xupdate_512_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] + '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] + '&vpsrlq ($t2,$t0,$sigma0[0])', + '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] + '&vpsrlq ($t3,$t0,$sigma0[2])', + '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', + '&vpxor ($t0,$t3,$t2)', + '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t1)', + '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t2)', + '&vpsrlq ($t3,@X[7],$sigma1[2]);', + '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) + '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', + '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) + '&vpsrlq ($t1,@X[7],$sigma1[0]);', + '&vpxor ($t3,$t3,$t2)', + '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', + '&vpxor ($t3,$t3,$t1)', + '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', + '&vpxor ($t3,$t3,$t2)', + '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) + '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + ); +} + +sub AVX_512_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body); # 52 instructions + + foreach (Xupdate_512_AVX()) { # 23 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<8; $j++) { + &AVX_512_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); + &jne (".Lavx_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +} +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + + add $SZ*0($ctx),$A + lea 16*$SZ($inp),$inp + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_avx + + mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps 16*$SZ+96(%rsp),%xmm10 + movaps 16*$SZ+112(%rsp),%xmm11 +___ +$code.=<<___; + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size ${func}_avx,.-${func}_avx +___ + +}}}}} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# 
CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HanderlData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue +___ +$code.=<<___; + mov %rax,%rsi # put aside Rsp + mov 16*$SZ+3*8(%rax),%rax # pull $_rsp + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx + jb .Lin_prologue # non-AVX code + + lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area + lea 512($context),%rdi # &context.Xmm6 + mov \$`$SZ==4?8:12`,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler +___ + +$code.=<<___ if ($SZ==4 && $shaext); +.type shaext_handler,\@abi-omnipotent +.align 16 +shaext_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lprologue_shaext(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lin_prologue + + lea .Lepilogue_shaext(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + lea -8-5*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$10,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lin_prologue +.size shaext_handler,.-shaext_handler +___ + +$code.=<<___; +.section .pdata +.align 4 + .rva .LSEH_begin_${func}_nohw + .rva .LSEH_end_${func}_nohw + .rva .LSEH_info_${func}_nohw +___ +$code.=<<___ if ($SZ==4 && $shaext); + .rva .LSEH_begin_${func}_hw + .rva .LSEH_end_${func}_hw + .rva .LSEH_info_${func}_hw +___ +$code.=<<___ if ($SZ==4); + .rva 
.LSEH_begin_${func}_ssse3 + .rva .LSEH_end_${func}_ssse3 + .rva .LSEH_info_${func}_ssse3 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_${func}_avx + .rva .LSEH_end_${func}_avx + .rva .LSEH_info_${func}_avx +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_${func}_nohw: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue,.Lepilogue # HandlerData[] +___ +$code.=<<___ if ($SZ==4 && $shaext); +.LSEH_info_${func}_hw: + .byte 9,0,0,0 + .rva shaext_handler +___ +$code.=<<___ if ($SZ==4); +.LSEH_info_${func}_ssse3: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_${func}_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/internal.h b/ring-0.17.14/crypto/internal.h new file mode 100644 index 0000000000..99223d1aca --- /dev/null +++ b/ring-0.17.14/crypto/internal.h @@ -0,0 +1,474 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_INTERNAL_H + +#include // Must be first. + +#include "ring-core/check.h" + +#if defined(__clang__) +// Don't require prototypes for functions defined in C that are only +// used from Rust. +#pragma GCC diagnostic ignored "-Wmissing-prototypes" +#endif + +#if defined(__GNUC__) && \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 +// |alignas| and |alignof| were added in C11. GCC added support in version 4.8. +// Testing for __STDC_VERSION__/__cplusplus doesn't work because 4.7 already +// reports support for C11. 
+#define alignas(x) __attribute__ ((aligned (x))) +#elif defined(_MSC_VER) && !defined(__clang__) +#define alignas(x) __declspec(align(x)) +#else +#include +#endif + +#if defined(__clang__) || defined(__GNUC__) +#define RING_NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define RING_NOINLINE __declspec(noinline) +#else +#define RING_NOINLINE +#endif + +// Some C compilers require a useless cast when dealing with arrays for the +// reason explained in +// https://gustedt.wordpress.com/2011/02/12/const-and-arrays/ +#if defined(__clang__) || defined(_MSC_VER) +#define RING_CORE_POINTLESS_ARRAY_CONST_CAST(cast) +#else +#define RING_CORE_POINTLESS_ARRAY_CONST_CAST(cast) cast +#endif + +// `uint8_t` isn't guaranteed to be 'unsigned char' and only 'char' and +// 'unsigned char' are allowed to alias according to ISO C. +typedef unsigned char aliasing_uint8_t; + +#if (!defined(_MSC_VER) || defined(__clang__)) && defined(OPENSSL_64_BIT) +#define BORINGSSL_HAS_UINT128 +typedef __int128_t int128_t; +typedef __uint128_t uint128_t; +#endif + +// GCC-like compilers indicate SSE2 with |__SSE2__|. MSVC leaves the caller to +// know that x86_64 has SSE2, and uses _M_IX86_FP to indicate SSE2 on x86. +// https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170 +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +# if defined(_MSC_VER) && !defined(__clang__) +# if defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +# define OPENSSL_SSE2 +# else +# error "SSE2 is required." +# endif +# elif !defined(__SSE2__) +# error "SSE2 is required." +# endif +#endif + +// For convenience in testing the fallback code, we allow disabling SSE2 +// intrinsics via |OPENSSL_NO_SSE2_FOR_TESTING|. We require SSE2 on x86 and +// x86_64, so we would otherwise need to test such code on a non-x86 platform. +// +// This does not remove the above requirement for SSE2 support with assembly +// optimizations. It only disables some intrinsics-based optimizations so that +// we can test the fallback code on CI. +#if defined(OPENSSL_SSE2) && defined(OPENSSL_NO_SSE2_FOR_TESTING) +#undef OPENSSL_SSE2 +#endif + +// Pointer utility functions. + +// buffers_alias returns one if |a| and |b| alias and zero otherwise. +static inline int buffers_alias(const void *a, size_t a_bytes, + const void *b, size_t b_bytes) { + // Cast |a| and |b| to integers. In C, pointer comparisons between unrelated + // objects are undefined whereas pointer to integer conversions are merely + // implementation-defined. We assume the implementation defined it in a sane + // way. + uintptr_t a_u = (uintptr_t)a; + uintptr_t b_u = (uintptr_t)b; + return a_u + a_bytes > b_u && b_u + b_bytes > a_u; +} + + +// Constant-time utility functions. +// +// The following methods return a bitmask of all ones (0xff...f) for true and 0 +// for false. This is useful for choosing a value based on the result of a +// conditional in constant time. 
For example, +// +// if (a < b) { +// c = a; +// } else { +// c = b; +// } +// +// can be written as +// +// crypto_word_t lt = constant_time_lt_w(a, b); +// c = constant_time_select_w(lt, a, b); + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +// '=': conversion from 'crypto_word_t' to 'uint8_t', possible loss of data +#pragma warning(disable: 4242) +// 'initializing': conversion from 'crypto_word_t' to 'uint8_t', ... +#pragma warning(disable: 4244) +#endif + +// crypto_word_t is the type that most constant-time functions use. Ideally we +// would like it to be |size_t|, but NaCl builds in 64-bit mode with 32-bit +// pointers, which means that |size_t| can be 32 bits when |BN_ULONG| is 64 +// bits. Since we want to be able to do constant-time operations on a +// |BN_ULONG|, |crypto_word_t| is defined as an unsigned value with the native +// word length. +#if defined(OPENSSL_64_BIT) +typedef uint64_t crypto_word_t; +#define CRYPTO_WORD_BITS (64u) +#elif defined(OPENSSL_32_BIT) +typedef uint32_t crypto_word_t; +#define CRYPTO_WORD_BITS (32u) +#else +#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT" +#endif + +#define CONSTTIME_TRUE_W ~((crypto_word_t)0) +#define CONSTTIME_FALSE_W ((crypto_word_t)0) + +// value_barrier_w returns |a|, but prevents GCC and Clang from reasoning about +// the returned value. This is used to mitigate compilers undoing constant-time +// code, until we can express our requirements directly in the language. +// +// Note the compiler is aware that |value_barrier_w| has no side effects and +// always has the same output for a given input. This allows it to eliminate +// dead code, move computations across loops, and vectorize. +static inline crypto_word_t value_barrier_w(crypto_word_t a) { +#if defined(__GNUC__) || defined(__clang__) + __asm__("" : "+r"(a) : /* no inputs */); +#endif + return a; +} + +// value_barrier_u32 behaves like |value_barrier_w| but takes a |uint32_t|. +static inline uint32_t value_barrier_u32(uint32_t a) { +#if defined(__GNUC__) || defined(__clang__) + __asm__("" : "+r"(a) : /* no inputs */); +#endif + return a; +} + +// |value_barrier_u8| could be defined as above, but compilers other than +// clang seem to still materialize 0x00..00MM instead of reusing 0x??..??MM. + +// constant_time_msb_w returns the given value with the MSB copied to all the +// other bits. +static inline crypto_word_t constant_time_msb_w(crypto_word_t a) { + return 0u - (a >> (sizeof(a) * 8 - 1)); +} + +// constant_time_is_zero returns 0xff..f if a == 0 and 0 otherwise. +static inline crypto_word_t constant_time_is_zero_w(crypto_word_t a) { + // Here is an SMT-LIB verification of this formula: + // + // (define-fun is_zero ((a (_ BitVec 32))) (_ BitVec 32) + // (bvand (bvnot a) (bvsub a #x00000001)) + // ) + // + // (declare-fun a () (_ BitVec 32)) + // + // (assert (not (= (= #x00000001 (bvlshr (is_zero a) #x0000001f)) (= a #x00000000)))) + // (check-sat) + // (get-model) + return constant_time_msb_w(~a & (a - 1)); +} + +static inline crypto_word_t constant_time_is_nonzero_w(crypto_word_t a) { + return ~constant_time_is_zero_w(a); +} + +// constant_time_eq_w returns 0xff..f if a == b and 0 otherwise. 
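// (Illustrative note, not part of the upstream header.) A typical pattern
// built from these helpers is a scan-and-select lookup that hides a secret
// index (|table|, |n| and |secret_index| are hypothetical):
//
//   crypto_word_t result = 0;
//   for (crypto_word_t i = 0; i < n; i++) {
//     crypto_word_t match = constant_time_eq_w(i, secret_index);
//     result = constant_time_select_w(match, table[i], result);
//   }
//
// Every table entry is read and the mask does the selection, so neither the
// branch predictor nor the data-cache access pattern depends on
// |secret_index|.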
+static inline crypto_word_t constant_time_eq_w(crypto_word_t a, + crypto_word_t b) { + return constant_time_is_zero_w(a ^ b); +} + +// constant_time_select_w returns (mask & a) | (~mask & b). When |mask| is all +// 1s or all 0s (as returned by the methods above), the select methods return +// either |a| (if |mask| is nonzero) or |b| (if |mask| is zero). +static inline crypto_word_t constant_time_select_w(crypto_word_t mask, + crypto_word_t a, + crypto_word_t b) { + // Clang recognizes this pattern as a select. While it usually transforms it + // to a cmov, it sometimes further transforms it into a branch, which we do + // not want. + // + // Hiding the value of the mask from the compiler evades this transformation. + mask = value_barrier_w(mask); + return (mask & a) | (~mask & b); +} + +// constant_time_select_8 acts like |constant_time_select| but operates on +// 8-bit values. +static inline uint8_t constant_time_select_8(crypto_word_t mask, uint8_t a, + uint8_t b) { + // |mask| is a word instead of |uint8_t| to avoid materializing 0x000..0MM + // Making both |mask| and its value barrier |uint8_t| would allow the compiler + // to materialize 0x????..?MM instead, but only clang is that clever. + // However, vectorization of bitwise operations seems to work better on + // |uint8_t| than a mix of |uint64_t| and |uint8_t|, so |m| is cast to + // |uint8_t| after the value barrier but before the bitwise operations. + uint8_t m = value_barrier_w(mask); + return (m & a) | (~m & b); +} + +// constant_time_conditional_memcpy copies |n| bytes from |src| to |dst| if +// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory +// ranges at |dst| and |src| must not overlap, as when calling |memcpy|. +static inline void constant_time_conditional_memcpy(void *dst, const void *src, + const size_t n, + const crypto_word_t mask) { + debug_assert_nonsecret(!buffers_alias(dst, n, src, n)); + uint8_t *out = (uint8_t *)dst; + const uint8_t *in = (const uint8_t *)src; + for (size_t i = 0; i < n; i++) { + out[i] = constant_time_select_8(mask, in[i], out[i]); + } +} + +// constant_time_conditional_memxor xors |n| bytes from |src| to |dst| if +// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory +// ranges at |dst| and |src| must not overlap, as when calling |memcpy|. +static inline void constant_time_conditional_memxor(void *dst, const void *src, + size_t n, + const crypto_word_t mask) { + debug_assert_nonsecret(!buffers_alias(dst, n, src, n)); + aliasing_uint8_t *out = dst; + const aliasing_uint8_t *in = src; +#if defined(__GNUC__) && !defined(__clang__) + // gcc 13.2.0 doesn't automatically vectorize this loop regardless of barrier + typedef aliasing_uint8_t v32u8 __attribute__((vector_size(32), aligned(1), may_alias)); + size_t n_vec = n&~(size_t)31; + v32u8 masks = ((aliasing_uint8_t)mask-(v32u8){}); // broadcast + for (size_t i = 0; i < n_vec; i += 32) { + *(v32u8*)&out[i] ^= masks & *(v32u8 const*)&in[i]; + } + out += n_vec; + n -= n_vec; +#endif + for (size_t i = 0; i < n; i++) { + out[i] ^= value_barrier_w(mask) & in[i]; + } +} + +#if defined(BORINGSSL_CONSTANT_TIME_VALIDATION) + +// CONSTTIME_SECRET takes a pointer and a number of bytes and marks that region +// of memory as secret. Secret data is tracked as it flows to registers and +// other parts of a memory. If secret data is used as a condition for a branch, +// or as a memory index, it will trigger warnings in valgrind. 
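// A sketch of the intended usage (illustrative only; |secret| and |out| are
// hypothetical buffers):
//
//   CONSTTIME_SECRET(secret, secret_len);   // start tracking these bytes
//   /* ... constant-time computation deriving |out| from |secret| ... */
//   CONSTTIME_DECLASSIFY(out, out_len);     // result may now be branched on
//
// Outside of BORINGSSL_CONSTANT_TIME_VALIDATION builds both macros expand to
// nothing, so they add no runtime cost in normal builds.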
+#define CONSTTIME_SECRET(ptr, len) VALGRIND_MAKE_MEM_UNDEFINED(ptr, len) + +// CONSTTIME_DECLASSIFY takes a pointer and a number of bytes and marks that +// region of memory as public. Public data is not subject to constant-time +// rules. +#define CONSTTIME_DECLASSIFY(ptr, len) VALGRIND_MAKE_MEM_DEFINED(ptr, len) + +#else + +#define CONSTTIME_SECRET(ptr, len) +#define CONSTTIME_DECLASSIFY(ptr, len) + +#endif // BORINGSSL_CONSTANT_TIME_VALIDATION + +static inline crypto_word_t constant_time_declassify_w(crypto_word_t v) { + // Return |v| through a value barrier to be safe. Valgrind-based constant-time + // validation is partly to check the compiler has not undone any constant-time + // work. Any place |BORINGSSL_CONSTANT_TIME_VALIDATION| influences + // optimizations, this validation is inaccurate. + // + // However, by sending pointers through valgrind, we likely inhibit escape + // analysis. On local variables, particularly booleans, we likely + // significantly impact optimizations. + // + // Thus, to be safe, stick a value barrier, in hopes of comparably inhibiting + // compiler analysis. + CONSTTIME_DECLASSIFY(&v, sizeof(v)); + return value_barrier_w(v); +} + +static inline int constant_time_declassify_int(int v) { + OPENSSL_STATIC_ASSERT(sizeof(uint32_t) == sizeof(int), + "int is not the same size as uint32_t"); + // See comment above. + CONSTTIME_DECLASSIFY(&v, sizeof(v)); + return value_barrier_u32((uint32_t)v); +} + +#if defined(_MSC_VER) && !defined(__clang__) +// '=': conversion from 'int64_t' to 'int32_t', possible loss of data +#pragma warning(pop) +#endif +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif + +// declassify_assert behaves like |assert| but declassifies the result of +// evaluating |expr|. This allows the assertion to branch on the (presumably +// public) result, but still ensures that values leading up to the computation +// were secret. +#define declassify_assert(expr) dev_assert_secret(constant_time_declassify_int(expr)) + +// Endianness conversions. + +#if defined(__GNUC__) && __GNUC__ >= 2 +static inline uint32_t CRYPTO_bswap4(uint32_t x) { + return __builtin_bswap32(x); +} + +static inline uint64_t CRYPTO_bswap8(uint64_t x) { + return __builtin_bswap64(x); +} +#elif defined(_MSC_VER) +#pragma warning(push, 3) +#include +#pragma warning(pop) +#pragma intrinsic(_byteswap_ulong) +static inline uint32_t CRYPTO_bswap4(uint32_t x) { + return _byteswap_ulong(x); +} +#endif + +#if !defined(RING_CORE_NOSTDLIBINC) +#include +#endif + +static inline void *OPENSSL_memcpy(void *dst, const void *src, size_t n) { +#if !defined(RING_CORE_NOSTDLIBINC) + if (n == 0) { + return dst; + } + return memcpy(dst, src, n); +#else + aliasing_uint8_t *d = dst; + const aliasing_uint8_t *s = src; + for (size_t i = 0; i < n; ++i) { + d[i] = s[i]; + } + return dst; +#endif +} + +static inline void *OPENSSL_memset(void *dst, int c, size_t n) { +#if !defined(RING_CORE_NOSTDLIBINC) + if (n == 0) { + return dst; + } + return memset(dst, c, n); +#else + aliasing_uint8_t *d = dst; + for (size_t i = 0; i < n; ++i) { + d[i] = (aliasing_uint8_t)c; + } + return dst; +#endif +} + + +// Loads and stores. +// +// The following functions load and store sized integers with the specified +// endianness. They use |memcpy|, and so avoid alignment or strict aliasing +// requirements on the input and output pointers. 
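// For example (illustrative only), given the bytes {0x01, 0x02, 0x03, 0x04}:
//
//   uint8_t buf[4] = {0x01, 0x02, 0x03, 0x04};
//   CRYPTO_load_u32_be(buf);   // 0x01020304 on every platform
//   CRYPTO_load_u32_le(buf);   // 0x04030201 on every platform
//
// The suffix names the byte order of the memory representation; the value
// loaded into the register is the same regardless of host endianness.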
+ +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define RING_BIG_ENDIAN +#endif +#endif + +static inline uint32_t CRYPTO_load_u32_le(const void *in) { + uint32_t v; + OPENSSL_memcpy(&v, in, sizeof(v)); +#if defined(RING_BIG_ENDIAN) + return CRYPTO_bswap4(v); +#else + return v; +#endif +} + +static inline void CRYPTO_store_u32_le(void *out, uint32_t v) { +#if defined(RING_BIG_ENDIAN) + v = CRYPTO_bswap4(v); +#endif + OPENSSL_memcpy(out, &v, sizeof(v)); +} + +static inline uint32_t CRYPTO_load_u32_be(const void *in) { + uint32_t v; + OPENSSL_memcpy(&v, in, sizeof(v)); +#if !defined(RING_BIG_ENDIAN) + return CRYPTO_bswap4(v); +#else + return v; +#endif +} + +static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { +#if !defined(RING_BIG_ENDIAN) + v = CRYPTO_bswap4(v); +#endif + OPENSSL_memcpy(out, &v, sizeof(v)); +} + +// Runtime CPU feature support + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +// OPENSSL_ia32cap_P contains the Intel CPUID bits when running on an x86 or +// x86-64 system. +// +// Index 0: +// EDX for CPUID where EAX = 1 +// Bit 30 is used to indicate an Intel CPU +// Index 1: +// ECX for CPUID where EAX = 1 +// Index 2: +// EBX for CPUID where EAX = 7, ECX = 0 +// Bit 14 (for removed feature MPX) is used to indicate a preference for ymm +// registers over zmm even when zmm registers are supported +// Index 3: +// ECX for CPUID where EAX = 7, ECX = 0 +// +// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the XMM, YMM, +// and AVX512 bits in XCR0, so it is not necessary to check those. (WARNING: See +// caveats in cpu_intel.c.) +#if defined(OPENSSL_X86_64) +extern uint32_t avx2_available; +extern uint32_t adx_bmi2_available; +#endif +#endif + + +#if defined(OPENSSL_ARM) +extern alignas(4) uint32_t neon_available; +#endif // OPENSSL_ARM + +#endif // OPENSSL_HEADER_CRYPTO_INTERNAL_H diff --git a/ring-0.17.14/crypto/limbs/limbs.c b/ring-0.17.14/crypto/limbs/limbs.c new file mode 100644 index 0000000000..d0027d400d --- /dev/null +++ b/ring-0.17.14/crypto/limbs/limbs.c @@ -0,0 +1,170 @@ +/* Copyright 2016-2017 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "limbs.h" + +#include "../internal.h" +#include "../fipsmodule/bn/internal.h" +#include "limbs.inl" + + +/* XXX: We assume that the conversion from |Carry| to |Limb| is constant-time, + * but we haven't verified that assumption. TODO: Fix it so we don't need to + * make that assumption. */ + +/* Returns 0xfff..f if |a| is zero, and zero otherwise. */ +Limb LIMB_is_zero(const Limb a) { + return constant_time_is_zero_w(a); +} + +/* Returns 0xfff..f if |a| is all zero limbs, and zero otherwise. |num_limbs| + * may be zero. 
*/ +Limb LIMBS_are_zero(const Limb a[], size_t num_limbs) { + Limb all = 0; + for (size_t i = 0; i < num_limbs; ++i) { + all |= a[i]; + } + return LIMB_is_zero(all); +} + +/* Returns 0xffff..f if |a == b|, and zero otherwise. |num_limbs| may be zero. */ +Limb LIMBS_equal(const Limb a[], const Limb b[], size_t num_limbs) { + Limb eq = CONSTTIME_TRUE_W; + for (size_t i = 0; i < num_limbs; ++i) { + eq = constant_time_select_w(eq, constant_time_eq_w(a[i], b[i]), eq); + } + return eq; +} + +/* Returns 0xffff...f if |a| is less than |b|, and zero otherwise. */ +Limb LIMBS_less_than(const Limb a[], const Limb b[], size_t num_limbs) { + debug_assert_nonsecret(num_limbs >= 1); + /* There are lots of ways to implement this. It is implemented this way to + * be consistent with |LIMBS_limbs_reduce_once| and other code that makes such + * comparisons as part of doing conditional reductions. */ + Limb dummy; + Carry borrow = limb_sub(&dummy, a[0], b[0]); + for (size_t i = 1; i < num_limbs; ++i) { + borrow = limb_sbb(&dummy, a[i], b[i], borrow); + } + return constant_time_is_nonzero_w(borrow); +} + +/* if (r >= m) { r -= m; } */ +void LIMBS_reduce_once(Limb r[], const Limb m[], size_t num_limbs) { + debug_assert_nonsecret(num_limbs >= 1); + /* This could be done more efficiently if we had |num_limbs| of extra space + * available, by storing |r - m| and then doing a conditional copy of either + * |r| or |r - m|. But, in order to operate in constant space, with an eye + * towards this function being used in RSA in the future, we do things a + * slightly less efficient way. */ + Limb lt = LIMBS_less_than(r, m, num_limbs); + Carry borrow = + limb_sub(&r[0], r[0], constant_time_select_w(lt, 0, m[0])); + for (size_t i = 1; i < num_limbs; ++i) { + /* XXX: This is probably particularly inefficient because the operations in + * constant_time_select affect the carry flag, so there will likely be + * loads and stores of |borrow|. 
*/ + borrow = + limb_sbb(&r[i], r[i], constant_time_select_w(lt, 0, m[i]), borrow); + } + dev_assert_secret(borrow == 0); +} + +void LIMBS_add_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], + size_t num_limbs) { + Limb overflow1 = + constant_time_is_nonzero_w(limbs_add(r, a, b, num_limbs)); + Limb overflow2 = ~LIMBS_less_than(r, m, num_limbs); + Limb overflow = overflow1 | overflow2; + Carry borrow = limb_sub(&r[0], r[0], m[0] & overflow); + for (size_t i = 1; i < num_limbs; ++i) { + borrow = limb_sbb(&r[i], r[i], m[i] & overflow, borrow); + } +} + +void LIMBS_sub_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], + size_t num_limbs) { + Limb underflow = + constant_time_is_nonzero_w(limbs_sub(r, a, b, num_limbs)); + Carry carry = limb_add(&r[0], r[0], m[0] & underflow); + for (size_t i = 1; i < num_limbs; ++i) { + carry = limb_adc(&r[i], r[i], m[i] & underflow, carry); + } +} + +void LIMBS_shl_mod(Limb r[], const Limb a[], const Limb m[], size_t num_limbs) { + Limb overflow1 = + constant_time_is_nonzero_w(a[num_limbs - 1] & LIMB_HIGH_BIT); + Limb carry = 0; + for (size_t i = 0; i < num_limbs; ++i) { + Limb limb = a[i]; + Limb new_carry = limb >> (LIMB_BITS - 1); + r[i] = (limb << 1) | carry; + carry = new_carry; + } + Limb overflow2 = ~LIMBS_less_than(r, m, num_limbs); + Limb overflow = overflow1 | overflow2; + Carry borrow = limb_sub(&r[0], r[0], m[0] & overflow); + for (size_t i = 1; i < num_limbs; ++i) { + borrow = limb_sbb(&r[i], r[i], m[i] & overflow, borrow); + } +} + +int LIMBS_select_512_32(Limb r[], const Limb table[], size_t num_limbs, + crypto_word_t index) { + if (num_limbs % (512 / LIMB_BITS) != 0) { + return 0; + } + limbs_select(r, table, num_limbs, 32, index); + return 1; +} + +static const Limb FIVE_BITS_MASK = 0x1f; + +crypto_word_t LIMBS_window5_split_window(Limb lower_limb, Limb higher_limb, size_t index_within_word) { + Limb high_bits = (higher_limb << (LIMB_BITS - index_within_word)) + & FIVE_BITS_MASK; + // There are no bits outside the window above |index_within_word| (if there + // were then this wouldn't be a split window), so we don't need to mask + // |low_bits|. + Limb low_bits = lower_limb >> index_within_word; + return low_bits | high_bits; +} + +crypto_word_t LIMBS_window5_unsplit_window(Limb limb, size_t index_within_word) { + return (limb >> index_within_word) & FIVE_BITS_MASK; +} + +Limb LIMB_shr(Limb a, size_t shift) { + return a >> shift; +} + +Limb limbs_mul_add_limb(Limb r[], const Limb a[], Limb b, size_t num_limbs) { + Limb carried = 0; + for (size_t i = 0; i < num_limbs; ++i) { + Limb lo; + Limb hi; + bn_umult_lohi(&lo, &hi, a[i], b); + Limb tmp; + Carry c = limb_add(&tmp, lo, carried); + c = limb_adc(&carried, hi, 0, c); + dev_assert_secret(c == 0); + c = limb_add(&r[i], r[i], tmp); + c = limb_adc(&carried, carried, 0, c); + // (A * B) + C + D never carries. + dev_assert_secret(c == 0); + } + return carried; +} diff --git a/ring-0.17.14/crypto/limbs/limbs.h b/ring-0.17.14/crypto/limbs/limbs.h new file mode 100644 index 0000000000..0cf83dd651 --- /dev/null +++ b/ring-0.17.14/crypto/limbs/limbs.h @@ -0,0 +1,38 @@ +/* Copyright 2016 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef RING_LIMBS_H +#define RING_LIMBS_H + +#include + +#include "../internal.h" + +typedef crypto_word_t Limb; + +#define LIMB_BITS CRYPTO_WORD_BITS +#define LIMB_HIGH_BIT ((Limb)(1) << (LIMB_BITS - 1)) + + +Limb LIMBS_are_zero(const Limb a[], size_t num_limbs); +Limb LIMBS_equal(const Limb a[], const Limb b[], size_t num_limbs); +void LIMBS_reduce_once(Limb r[], const Limb m[], size_t num_limbs); +void LIMBS_add_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], + size_t num_limbs); +void LIMBS_sub_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[], + size_t num_limbs); +void LIMBS_shl_mod(Limb r[], const Limb a[], const Limb m[], size_t num_limbs); +Limb limbs_mul_add_limb(Limb r[], const Limb a[], Limb b, size_t num_limbs); + +#endif /* RING_LIMBS_H */ diff --git a/ring-0.17.14/crypto/limbs/limbs.inl b/ring-0.17.14/crypto/limbs/limbs.inl new file mode 100644 index 0000000000..1ca72cbb23 --- /dev/null +++ b/ring-0.17.14/crypto/limbs/limbs.inl @@ -0,0 +1,162 @@ +/* Copyright 2016 Brian Smith. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include "limbs.h" +#include "ring-core/check.h" + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push, 3) +#include +#pragma warning(pop) + +/* MSVC 2015 RC, when compiling for x86 with /Ox (at least), miscompiles + * _addcarry_u32(c, 0, prod_hi, &x) like so: + * + * add eax,esi ; The previous add that might have set the carry flag. + * xor esi,esi ; OOPS! Carry flag is now reset! + * mov dword ptr [edi-4],eax + * adc esi,dword ptr [prod_hi] + * + * We test with MSVC 2015 update 2, so make sure we're using a version at least + * as new as that. */ +#if _MSC_FULL_VER < 190023918 +#error "MSVC 2015 Update 2 or later is required." 
+#endif +typedef uint8_t Carry; +#if LIMB_BITS == 64 +#pragma intrinsic(_addcarry_u64, _subborrow_u64) +#define RING_CORE_ADDCARRY_INTRINSIC _addcarry_u64 +#define RING_CORE_SUBBORROW_INTRINSIC _subborrow_u64 +#elif LIMB_BITS == 32 +#pragma intrinsic(_addcarry_u32, _subborrow_u32) +#define RING_CORE_ADDCARRY_INTRINSIC _addcarry_u32 +#define RING_CORE_SUBBORROW_INTRINSIC _subborrow_u32 +typedef uint64_t DoubleLimb; +#endif +#else +typedef Limb Carry; +#if LIMB_BITS == 64 +typedef __uint128_t DoubleLimb; +#elif LIMB_BITS == 32 +typedef uint64_t DoubleLimb; +#endif +#endif + +/* |*r = a + b + carry_in|, returning carry out bit. |carry_in| must be 0 or 1. + */ +static inline Carry limb_adc(Limb *r, Limb a, Limb b, Carry carry_in) { + dev_assert_secret(carry_in == 0 || carry_in == 1); + Carry ret; +#if defined(RING_CORE_ADDCARRY_INTRINSIC) + ret = RING_CORE_ADDCARRY_INTRINSIC(carry_in, a, b, r); +#else + DoubleLimb x = (DoubleLimb)a + b + carry_in; + *r = (Limb)x; + ret = (Carry)(x >> LIMB_BITS); +#endif + dev_assert_secret(ret == 0 || ret == 1); + return ret; +} + +/* |*r = a + b|, returning carry bit. */ +static inline Carry limb_add(Limb *r, Limb a, Limb b) { + Carry ret; +#if defined(RING_CORE_ADDCARRY_INTRINSIC) + ret = RING_CORE_ADDCARRY_INTRINSIC(0, a, b, r); +#else + DoubleLimb x = (DoubleLimb)a + b; + *r = (Limb)x; + ret = (Carry)(x >> LIMB_BITS); +#endif + dev_assert_secret(ret == 0 || ret == 1); + return ret; +} + +/* |*r = a - b - borrow_in|, returning the borrow out bit. |borrow_in| must be + * 0 or 1. */ +static inline Carry limb_sbb(Limb *r, Limb a, Limb b, Carry borrow_in) { + dev_assert_secret(borrow_in == 0 || borrow_in == 1); + Carry ret; +#if defined(RING_CORE_SUBBORROW_INTRINSIC) + ret = RING_CORE_SUBBORROW_INTRINSIC(borrow_in, a, b, r); +#else + DoubleLimb x = (DoubleLimb)a - b - borrow_in; + *r = (Limb)x; + ret = (Carry)((x >> LIMB_BITS) & 1); +#endif + dev_assert_secret(ret == 0 || ret == 1); + return ret; +} + +/* |*r = a - b|, returning borrow bit. */ +static inline Carry limb_sub(Limb *r, Limb a, Limb b) { + Carry ret; +#if defined(RING_CORE_SUBBORROW_INTRINSIC) + ret = RING_CORE_SUBBORROW_INTRINSIC(0, a, b, r); +#else + DoubleLimb x = (DoubleLimb)a - b; + *r = (Limb)x; + ret = (Carry)((x >> LIMB_BITS) & 1); +#endif + dev_assert_secret(ret == 0 || ret == 1); + return ret; +} + +static inline Carry limbs_add(Limb r[], const Limb a[], const Limb b[], + size_t num_limbs) { + debug_assert_nonsecret(num_limbs >= 1); + Carry carry = limb_add(&r[0], a[0], b[0]); + for (size_t i = 1; i < num_limbs; ++i) { + carry = limb_adc(&r[i], a[i], b[i], carry); + } + return carry; +} + +/* |r -= s|, returning the borrow. 
*/ +static inline Carry limbs_sub(Limb r[], const Limb a[], const Limb b[], + size_t num_limbs) { + debug_assert_nonsecret(num_limbs >= 1); + Carry borrow = limb_sub(&r[0], a[0], b[0]); + for (size_t i = 1; i < num_limbs; ++i) { + borrow = limb_sbb(&r[i], a[i], b[i], borrow); + } + return borrow; +} + +static inline void limbs_copy(Limb r[], const Limb a[], size_t num_limbs) { + for (size_t i = 0; i < num_limbs; ++i) { + r[i] = a[i]; + } +} + +static inline void limbs_select(Limb r[], const Limb table[], + size_t num_limbs, size_t num_entries, + crypto_word_t index) { + for (size_t i = 0; i < num_limbs; ++i) { + r[i] = 0; + } + + for (size_t e = 0; e < num_entries; ++e) { + Limb equal = constant_time_eq_w(index, e); + for (size_t i = 0; i < num_limbs; ++i) { + r[i] = constant_time_select_w(equal, table[(e * num_limbs) + i], r[i]); + } + } +} + +static inline void limbs_zero(Limb r[], size_t num_limbs) { + for (size_t i = 0; i < num_limbs; ++i) { + r[i] = 0; + } +} diff --git a/ring-0.17.14/crypto/mem.c b/ring-0.17.14/crypto/mem.c new file mode 100644 index 0000000000..5a85ceaca8 --- /dev/null +++ b/ring-0.17.14/crypto/mem.c @@ -0,0 +1,28 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "internal.h" + +int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) { + const aliasing_uint8_t *a = in_a; + const aliasing_uint8_t *b = in_b; + uint8_t x = 0; + + for (size_t i = 0; i < len; i++) { + x |= a[i] ^ b[i]; + } + + return x; +} diff --git a/ring-0.17.14/crypto/perlasm/arm-xlate.pl b/ring-0.17.14/crypto/perlasm/arm-xlate.pl new file mode 100644 index 0000000000..36971387ed --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/arm-xlate.pl @@ -0,0 +1,263 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
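The limb helpers above (limb_adc/limb_sbb, LIMBS_less_than, LIMBS_reduce_once and friends) all lean on two tricks: the borrow out of a full-width subtraction is 1 exactly when a < b, and widening that bit into an all-ones/all-zeros mask lets the conditional reduction run without branching on secret data. Below is a minimal standalone sketch of both tricks, assuming 64-bit limbs and a compiler with unsigned __int128 (GCC/Clang); the names lt_mask2 and reduce_once2 are made up for illustration and are not part of ring.

    /* Illustrative re-implementation, not the ring code: 2-limb borrow-as-mask
     * comparison and branch-free conditional reduction. */
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t limb_t;

    /* All-ones if a < b (little-endian 2-limb values), zero otherwise. */
    static limb_t lt_mask2(const limb_t a[2], const limb_t b[2]) {
      unsigned __int128 d = (unsigned __int128)a[0] - b[0];
      limb_t borrow = (limb_t)(d >> 64) & 1;
      d = (unsigned __int128)a[1] - b[1] - borrow;
      borrow = (limb_t)(d >> 64) & 1;
      return (limb_t)0 - borrow;
    }

    /* if (r >= m) { r -= m; } without a data-dependent branch. */
    static void reduce_once2(limb_t r[2], const limb_t m[2]) {
      limb_t mask = ~lt_mask2(r, m);          /* subtract m only when r >= m */
      unsigned __int128 d = (unsigned __int128)r[0] - (m[0] & mask);
      limb_t borrow = (limb_t)(d >> 64) & 1;
      r[0] = (limb_t)d;
      d = (unsigned __int128)r[1] - (m[1] & mask) - borrow;
      r[1] = (limb_t)d;
    }

    int main(void) {
      limb_t r[2] = {7, 0}, m[2] = {5, 0};
      reduce_once2(r, m);
      printf("%llu\n", (unsigned long long)r[0]);  /* prints 2 */
      return 0;
    }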
+ +use strict; + +my $flavour = shift; +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +$flavour = "linux32" if (!$flavour or $flavour eq "void"); + +my %GLOBALS; +my $dotinlocallabels=($flavour=~/linux/)?1:0; + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { + if ($flavour =~ /linux/) { ".arch\t".join(',',@_); } + elsif ($flavour =~ /win64/) { ".arch\t".join(',',@_); } + else { ""; } +}; +my $fpu = sub { + if ($flavour =~ /linux/) { ".fpu\t".join(',',@_); } + else { ""; } +}; +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } + elsif ($flavour =~ /win64/) { ""; } + else { ".hidden\t".join(',',@_); } +}; +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0"; + $name = "_$name"; + } else { $ret = ".comm\t".join(',',@args); } + + $$global = $name; + $ret; +}; +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; + last; + }; + } + + $ret = ".globl $name\n"; + # All symbols in assembly files are hidden. + $ret .= &$hidden($name); + $$global = $name; + $ret; +}; +my $global = $globl; +my $extern = sub { + &$globl(@_); + return; # return nothing +}; +my $type = sub { + if ($flavour =~ /linux/) { ".type\t".join(',',@_); } + elsif ($flavour =~ /ios32/) { if (join(',',@_) =~ /(\w+),%function/) { + "#ifdef __thumb2__\n". + ".thumb_func $1\n". + "#endif"; + } + } + elsif ($flavour =~ /win64/) { if (join(',',@_) =~ /(\w+),%function/) { + # See https://sourceware.org/binutils/docs/as/Pseudo-Ops.html + # Per https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#coff-symbol-table, + # the type for functions is 0x20, or 32. + ".def $1\n". + " .type 32\n". + ".endef"; + } + } + else { ""; } +}; +my $size = sub { + if ($flavour =~ /linux/) { ".size\t".join(',',@_); } + else { ""; } +}; +my $inst = sub { + if ($flavour =~ /linux/) { ".inst\t".join(',',@_); } + else { ".long\t".join(',',@_); } +}; +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; } + else + { ""; } +}; +my $section = sub { + if ($flavour =~ /ios/) { + if ($_[0] eq ".rodata") { + return ".section\t__TEXT,__const"; + } + die "Unknown section name $_[0]"; + } else { + return ".section\t" . 
join(",", @_); + } +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + return $line; +} + +my ($arch_defines, $target_defines); +if ($flavour =~ /32/) { + $arch_defines = "defined(OPENSSL_ARM)"; +} elsif ($flavour =~ /64/) { + $arch_defines = "defined(OPENSSL_AARCH64)"; +} else { + die "unknown architecture: $flavour"; +} +if ($flavour =~ /linux/) { + # Although the flavour is specified as "linux", it is really used by all + # ELF platforms. + $target_defines = "defined(__ELF__)"; +} elsif ($flavour =~ /ios/) { + # Although the flavour is specified as "ios", it is really used by all Apple + # platforms. + $target_defines = "defined(__APPLE__)"; +} elsif ($flavour =~ /win/) { + $target_defines = "defined(_WIN32)"; +} else { + die "unknown target: $flavour"; +} + +print <<___; +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && $arch_defines && $target_defines +___ + +while(my $line=<>) { + + if ($line =~ m/^\s*(#|@|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + if ($flavour =~ /64/) { + my $copy = $line; + # Also remove line comments. + $copy =~ s|//.*||; + if ($copy =~ /\b[wx]18\b/) { + die "r18 is reserved by the platform and may not be used."; + } + } + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + printf "%s:",($GLOBALS{$label} or $label); + } + } + + if ($line !~ m/^[#@]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + if ($flavour =~ /ios/) { + # Mach-O and ELF use different syntax for these relocations. Note + # that we require :pg_hi21: to be explicitly listed. It is normally + # optional with adrp instructions. + $line =~ s|:pg_hi21:(\w+)|\1\@PAGE|; + $line =~ s|:lo12:(\w+)|\1\@PAGEOFF|; + } else { + # Clang's integrated assembly does not support the optional + # :pg_hi21: markers, so erase them. + $line =~ s|:pg_hi21:||; + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg ne ""); + } + } + + print $line if ($line); + print "\n"; +} + +print <<___; +#endif // !OPENSSL_NO_ASM && $arch_defines && $target_defines +___ + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/ring-0.17.14/crypto/perlasm/x86_64-xlate.pl b/ring-0.17.14/crypto/perlasm/x86_64-xlate.pl new file mode 100644 index 0000000000..f9d71e1d24 --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/x86_64-xlate.pl @@ -0,0 +1,1894 @@ +#! 
/usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by . +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. +# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM one. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about "red zone," stick to more traditional blended +# stack frame allocation. If volatile storage is actually required +# that is. If not, just leave the stack as is. +# 3. Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about latter). +# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is +# required to identify the spots, where to inject Win64 epilogue! +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling unified +# Win64 prologue copies %rsp value to %rax. For further details +# see SEH paragraph at the end. +# 9. .init segment is allowed to contain calls to functions only. +# a. If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. +# +# TODO(https://crbug.com/boringssl/259): The dual-ABI mechanism described here +# does not quite unwind correctly on Windows. The seh_directive logic below has +# the start of a new mechanism. 
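Rules 3 and 6 above are easier to follow with the concrete shape of what the script injects; the emission itself lives in the opcode and label packages later in this file and is gated on the declared argument count. Written here as C string data purely for illustration, the synthesized Win64 prologue for a ".type name,@function" symbol spills %rdi/%rsi into the Win64 home space and moves the Microsoft argument registers into their SysV positions:

    /* Illustration only: the register shuffle performed by the unified Win64
     * prologue (see the label package below); each "ret" in such a function is
     * preceded by reloading %rdi/%rsi from 8(%rsp)/16(%rsp). */
    static const char *const kWin64PrologueShape[] = {
        "movq %rdi,8(%rsp)",   /* spill rdi/rsi into the home space */
        "movq %rsi,16(%rsp)",
        "movq %rsp,%rax",      /* snapshot rsp for SEH */
        "movq %rcx,%rdi",      /* Win64 arg 1 -> SysV arg 1 */
        "movq %rdx,%rsi",      /* Win64 arg 2 -> SysV arg 2 */
        "movq %r8,%rdx",       /* Win64 arg 3 -> SysV arg 3 */
        "movq %r9,%rcx",       /* Win64 arg 4 -> SysV arg 4 */
        "movq 40(%rsp),%r8",   /* Win64 arg 5 (stack) -> SysV arg 5 */
        "movq 48(%rsp),%r9",   /* Win64 arg 6 (stack) -> SysV arg 6 */
    };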
+ + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); + +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $apple=0; +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + # TODO(davidben): Before supporting the + # mingw64 perlasm flavour, do away with this + # environment variable check. + die "mingw64 not supported"; + $prefix=`echo __USER_LABEL_PREFIX__ | $ENV{CC} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $apple=1; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) { die "unknown flavour $flavour"; } + +my $current_segment; +my $current_function; +my %globals; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... + $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^or([qlwb])$/) { + $self->{op} = "or"; + $self->{sz} = $1; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + if ($win64 && $current_function->{abi} eq "svr4") { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . "ret"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + if ($win64 && $current_function->{abi} eq "svr4") { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". 
+ "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "ret"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... + $value =~ s/(?{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + $self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl... 
+ if ((1<<31)<<1) { + $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg; + } else { + $self->{label} =~ s/\b([0-9]+)\b/$1>>0/eg; + } + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/) && ($sz="x"); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . 
":"; + if ($win64 && $current_function->{name} eq $self->{value} + && $current_function->{abi} eq "svr4") { + $func .= "\n"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,%rax\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " movq %rcx,%rdi\n" if ($narg>0); + $func .= " movq %rdx,%rsi\n" if ($narg>1); + $func .= " movq %r8,%rdx\n" if ($narg>2); + $func .= " movq %r9,%rcx\n" if ($narg>3); + $func .= " movq 40(%rsp),%r8\n" if ($narg>4); + $func .= " movq 48(%rsp),%r9\n" if ($narg>5); + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + # Make all labels in masm global. + $self->{value} .= ":" if ($masm); + $self->{value} . ":"; + } elsif ($win64 && $current_function->{abi} eq "svr4") { + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov rax,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:"; + $func .= ":" if ($masm); + $func .= "\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " mov rdi,rcx\n" if ($narg>0); + $func .= " mov rsi,rdx\n" if ($narg>1); + $func .= " mov rdx,r8\n" if ($narg>2); + $func .= " mov rcx,r9\n" if ($narg>3); + $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); + $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); + $func .= "\n"; + } else { + "$current_function->{name}". + ($nasm ? ":" : "\tPROC $current_function->{scope}"); + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + if ($nasm && $self->{opcode}->mnemonic()=~m/^j(?![re]cxz)/) { + "NEAR ".$self->{value}; + } else { + $self->{value}; + } + } +} +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. 
Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcole, sleb128 + regx => 0x90, # uleb28 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". + my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_rsp); + my @cfa_stack; + + # [us]leb128 format is variable-length integer representation base + # 2^128, with most significant bit of each byte being 0 denoting + # *last* most significant digit. See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... + last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DWP_OP_reg, use "regx:N"... 
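The uleb128/sleb128 subs above are the standard DWARF variable-length integer encoders. A C rendering with a couple of hand-checkable values may help when reading the generated .cfi_escape byte strings; it assumes the usual arithmetic right shift of negative values, just as the Perl version relies on "use integer" for its sign handling.

    /* Illustrative C equivalents of the uleb128/sleb128 helpers above. */
    #include <stddef.h>
    #include <stdint.h>

    static size_t c_uleb128(uint64_t v, uint8_t out[10]) {
      size_t n = 0;
      do {
        uint8_t byte = v & 0x7f;
        v >>= 7;
        if (v != 0) byte |= 0x80;            /* more digits follow */
        out[n++] = byte;
      } while (v != 0);
      return n;
    }

    static size_t c_sleb128(int64_t v, uint8_t out[10]) {
      size_t n = 0;
      for (;;) {
        uint8_t byte = v & 0x7f;
        v >>= 7;                             /* arithmetic shift keeps the sign */
        int done = (v == 0 && !(byte & 0x40)) || (v == -1 && (byte & 0x40));
        if (!done) byte |= 0x80;
        out[n++] = byte;
        if (done) return n;
      }
    }

    /* Hand-checkable values: c_uleb128(624485) -> e5 8e 26,
     * c_sleb128(-8) -> 78, c_sleb128(-129) -> ff 7e. */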
+ } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; }; + /endproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", 0); last; }; + /def_cfa_register/ + && do { $cfa_reg = $$line; last; }; + /def_cfa_offset/ + && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp"); + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + /remember_state/ + && do { push @cfa_stack, [$cfa_reg, $cfa_rsp]; + last; + }; + /restore_state/ + && do { ($cfa_reg, $cfa_rsp) = @{pop @cfa_stack}; + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return ($elf ? $self->{value} : undef); + } +} +{ package seh_directive; + # This implements directives, like MASM, gas, and clang-assembler for + # specifying Windows unwind codes. See + # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170 + # for details on the Windows unwind mechanism. As perlasm generally uses gas + # syntax, the syntax is patterned after the gas spelling, described in + # https://sourceware.org/legacy-ml/binutils/2009-08/msg00193.html + # + # TODO(https://crbug.com/boringssl/571): Translate to the MASM directives + # when using the MASM output. Emit as-is when using "mingw64" output, which + # is Windows with gas syntax. + # + # TODO(https://crbug.com/boringssl/259): For now, SEH directives are ignored + # on non-Windows platforms. This means functions need to specify both CFI + # and SEH directives, often redundantly. Ideally we'd abstract between the + # two. 
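Tracing the comment's example through cfa_expression() and the DW_OP tables above, .cfi_cfa_expression %rsp+40,deref,+8 should come out as the bytes below (an illustrative hand-derivation, not output captured from the script):

    /* Expected expansion of ".cfi_cfa_expression %rsp+40,deref,+8". */
    static const unsigned char kCfaEscape[] = {
        0x0f,        /* DW_CFA_def_cfa_expression */
        0x05,        /* length of the expression in bytes */
        0x77, 0x28,  /* DW_OP_breg7 (%rsp), sleb128(40): saved sp is at rsp+40 */
        0x06,        /* DW_OP_deref: load it */
        0x23, 0x08,  /* DW_OP_plus_uconst, uleb128(8): step over the return address */
    };
    /* i.e. ".cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08" */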
E.g., we can synthesize CFI from SEH prologues, but SEH does not + # annotate epilogs, so we'd need to combine parts from both. Or we can + # restrict ourselves to a subset of CFI and synthesize SEH from CFI. + # + # Additionally, this only supports @abi-omnipotent functions. It is + # incompatible with the automatic calling convention conversion. The main + # complication is the current scheme modifies RDI and RSI (non-volatile on + # Windows) at the start of the function, and saves them in the parameter + # stack area. This can be expressed with .seh_savereg, but .seh_savereg is + # only usable late in the prologue. However, unwind information gives enough + # information to locate the parameter stack area at any point in the + # function, so we can defer conversion or implement other schemes. + + my $UWOP_PUSH_NONVOL = 0; + my $UWOP_ALLOC_LARGE = 1; + my $UWOP_ALLOC_SMALL = 2; + my $UWOP_SET_FPREG = 3; + my $UWOP_SAVE_NONVOL = 4; + my $UWOP_SAVE_NONVOL_FAR = 5; + my $UWOP_SAVE_XMM128 = 8; + my $UWOP_SAVE_XMM128_FAR = 9; + + my %UWOP_REG_TO_NUMBER = ("%rax" => 0, "%rcx" => 1, "%rdx" => 2, "%rbx" => 3, + "%rsp" => 4, "%rbp" => 5, "%rsi" => 6, "%rdi" => 7, + map(("%r$_" => $_), (8..15))); + my %UWOP_NUMBER_TO_REG = reverse %UWOP_REG_TO_NUMBER; + + # The contents of the pdata and xdata sections so far. + my ($xdata, $pdata) = ("", ""); + + my %info; + + my $next_label = 0; + my $current_label_func = ""; + + # _new_unwind_label allocates a new label, unique to the file. + sub _new_unwind_label { + my ($name) = (@_); + # Labels only need to be unique, but to make diffs easier to read, scope + # them all under the current function. + my $func = $current_function->{name}; + if ($func ne $current_label_func) { + $current_label_func = $func; + $next_label = 0; + } + + my $num = $next_label++; + return ".LSEH_${name}_${func}_${num}"; + } + + sub _check_in_proc { + die "Missing .seh_startproc directive" unless %info; + } + + sub _check_in_prologue { + _check_in_proc(); + die "Invalid SEH directive after .seh_endprologue" if defined($info{endprologue}); + } + + sub _check_not_in_proc { + die "Missing .seh_endproc directive" if %info; + } + + sub _startproc { + _check_not_in_proc(); + if ($current_function->{abi} eq "svr4") { + die "SEH directives can only be used with \@abi-omnipotent"; + } + + my $info_label = _new_unwind_label("info"); + my $start_label = _new_unwind_label("begin"); + %info = ( + # info_label is the label of the function's entry in .xdata. + info_label => $info_label, + # start_label is the start of the function. + start_label => $start_label, + # endprologue is the label of the end of the prologue. + endprologue => undef, + # unwind_codes contains the textual representation of the + # unwind codes in the function so far. + unwind_codes => "", + # num_codes is the number of 16-bit words in unwind_codes. + num_codes => 0, + # frame_reg is the number of the frame register, or zero if + # there is none. + frame_reg => 0, + # frame_offset is the offset into the fixed part of the stack that + # the frame register points into. + frame_offset => 0, + # has_offset is whether directives taking an offset have + # been used. This is used to check that such directives + # come after the fixed portion of the stack frame is established. + has_offset => 0, + # has_nonpushreg is whether directives other than + # .seh_pushreg have been used. This is used to check that + # .seh_pushreg directives are first. 
+ has_nonpushreg => 0, + ); + return $start_label; + } + + sub _add_unwind_code { + my ($op, $value, @extra) = @_; + _check_in_prologue(); + if ($op != $UWOP_PUSH_NONVOL) { + $info{has_nonpushreg} = 1; + } elsif ($info{has_nonpushreg}) { + die ".seh_pushreg directives must appear first in the prologue"; + } + + my $label = _new_unwind_label("prologue"); + # Encode an UNWIND_CODE structure. See + # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-unwind_code + my $encoded = $op | ($value << 4); + my $codes = <<____; + .byte $label-$info{start_label} + .byte $encoded +____ + # Some opcodes need additional values to encode themselves. + foreach (@extra) { + $codes .= "\t.value\t$_\n"; + } + + $info{num_codes} += 1 + scalar(@extra); + # Unwind codes are listed in reverse order. + $info{unwind_codes} = $codes . $info{unwind_codes}; + return $label; + } + + sub _updating_fixed_allocation { + _check_in_prologue(); + if ($info{frame_reg} != 0) { + # Windows documentation does not explicitly forbid .seh_stackalloc + # after .seh_setframe, but it appears to have no effect. Offsets are + # still relative to the fixed allocation when the frame register was + # established. + die "fixed allocation may not be increased after .seh_setframe"; + } + if ($info{has_offset}) { + # Windows documentation does not explicitly forbid .seh_savereg + # before .seh_stackalloc, but it does not work very well. Offsets + # are relative to the top of the final fixed allocation, not where + # RSP currently is. + die "directives with an offset must come after the fixed allocation is established."; + } + } + + sub _endproc { + _check_in_proc(); + if (!defined($info{endprologue})) { + die "Missing .seh_endprologue"; + } + + my $end_label = _new_unwind_label("end"); + # Encode a RUNTIME_FUNCTION. See + # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-runtime_function + $pdata .= <<____; + .rva $info{start_label} + .rva $end_label + .rva $info{info_label} + +____ + + # Encode an UNWIND_INFO. See + # https://learn.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-170#struct-unwind_info + my $frame_encoded = $info{frame_reg} | (($info{frame_offset} / 16) << 4); + $xdata .= <<____; +$info{info_label}: + .byte 1 # version 1, no flags + .byte $info{endprologue}-$info{start_label} + .byte $info{num_codes} + .byte $frame_encoded +$info{unwind_codes} +____ + + # UNWIND_INFOs must be 4-byte aligned. If needed, we must add an extra + # unwind code. This does not change the unwind code count. Windows + # documentation says "For alignment purposes, this array always has an + # even number of entries, and the final entry is potentially unused. In + # that case, the array is one longer than indicated by the count of + # unwind codes field." 
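For reference, the .pdata/.xdata records that _endproc() assembles follow the fixed layouts from the Microsoft documentation linked above; a sketch of the common case handled here (version 1, no flags, no chained unwind info):

    /* Illustration of the emitted records, not code used by the script. */
    struct runtime_function {          /* one .pdata entry: three RVAs */
      unsigned int begin_address;      /* .rva start_label */
      unsigned int end_address;        /* .rva end_label */
      unsigned int unwind_info;        /* .rva info_label */
    };

    struct unwind_info_header {        /* start of the matching .xdata entry */
      unsigned char version_and_flags; /* 1: version 1, no flags */
      unsigned char prologue_size;     /* endprologue - start_label */
      unsigned char count_of_codes;    /* num_codes */
      unsigned char frame_reg_and_off; /* frame_reg | ((frame_offset / 16) << 4) */
      /* followed by the 16-bit unwind codes, padded to an even count */
    };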
+ if ($info{num_codes} & 1) { + $xdata .= "\t.value\t0\n"; + } + + %info = (); + return $end_label; + } + + sub re { + my ($class, $line) = @_; + if ($$line =~ s/^\s*\.seh_(\w+)\s*//) { + my $dir = $1; + if (!$win64) { + $$line = ""; + return; + } + + my $label; + SWITCH: for ($dir) { + /^startproc$/ && do { + $label = _startproc($1); + last; + }; + /^pushreg$/ && do { + $$line =~ /^(%\w+)\s*$/ or die "could not parse .seh_$dir"; + my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; + _updating_fixed_allocation(); + $label = _add_unwind_code($UWOP_PUSH_NONVOL, $reg_num); + last; + }; + /^stackalloc$/ && do { + my $num = eval($$line); + if ($num <= 0 || $num % 8 != 0) { + die "invalid stack allocation: $num"; + } + _updating_fixed_allocation(); + if ($num <= 128) { + $label = _add_unwind_code($UWOP_ALLOC_SMALL, ($num - 8) / 8); + } elsif ($num < 512 * 1024) { + $label = _add_unwind_code($UWOP_ALLOC_LARGE, 0, $num / 8); + } elsif ($num < 4 * 1024 * 1024 * 1024) { + $label = _add_unwind_code($UWOP_ALLOC_LARGE, 1, $num >> 16, $num & 0xffff); + } else { + die "stack allocation too large: $num" + } + last; + }; + /^setframe$/ && do { + if ($info{frame_reg} != 0) { + die "duplicate .seh_setframe directive"; + } + if ($info{has_offset}) { + die "directives with with an offset must come after .seh_setframe."; + } + $$line =~ /(%\w+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; + my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; + my $offset = eval($2); + if ($offset < 0 || $offset % 16 != 0 || $offset > 240) { + die "invalid offset: $offset"; + } + $info{frame_reg} = $reg_num; + $info{frame_offset} = $offset; + $label = _add_unwind_code($UWOP_SET_FPREG, 0); + last; + }; + /^savereg$/ && do { + $$line =~ /(%\w+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; + my $reg_num = $UWOP_REG_TO_NUMBER{$1} or die "unknown register $1"; + my $offset = eval($2); + if ($offset < 0 || $offset % 8 != 0) { + die "invalid offset: $offset"; + } + if ($offset < 8 * 65536) { + $label = _add_unwind_code($UWOP_SAVE_NONVOL, $reg_num, $offset / 8); + } else { + $label = _add_unwind_code($UWOP_SAVE_NONVOL_FAR, $reg_num, $offset >> 16, $offset & 0xffff); + } + $info{has_offset} = 1; + last; + }; + /^savexmm$/ && do { + $$line =~ /%xmm(\d+)\s*,\s*(.+)/ or die "could not parse .seh_$dir"; + my $reg_num = $1; + my $offset = eval($2); + if ($offset < 0 || $offset % 16 != 0) { + die "invalid offset: $offset"; + } + if ($offset < 16 * 65536) { + $label = _add_unwind_code($UWOP_SAVE_XMM128, $reg_num, $offset / 16); + } else { + $label = _add_unwind_code($UWOP_SAVE_XMM128_FAR, $reg_num, $offset >> 16, $offset & 0xffff); + } + $info{has_offset} = 1; + last; + }; + /^endprologue$/ && do { + _check_in_prologue(); + if ($info{num_codes} == 0) { + # If a Windows function has no directives (i.e. it + # doesn't touch the stack), it is a leaf function and is + # not expected to appear in .pdata or .xdata. + die ".seh_endprologue found with no unwind codes"; + } + + $label = _new_unwind_label("endprologue"); + $info{endprologue} = $label; + last; + }; + /^endproc$/ && do { + $label = _endproc(); + last; + }; + die "unknown SEH directive .seh_$dir"; + } + + # All SEH directives compile to labels inline. The other data is + # emitted later. 
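The second byte of each UNWIND_CODE packs the operation into the low nibble and the operation info into the high nibble, which is all _add_unwind_code() computes; the first byte is the prologue offset of the label emitted alongside it. A few hand-worked directives, matching the switch above (illustrative):

    /* encoded = op | (info << 4), as in _add_unwind_code() */
    static unsigned char seh_unwind_code(unsigned op, unsigned info) {
      return (unsigned char)(op | (info << 4));
    }
    /* .seh_pushreg %rbp    -> UWOP_PUSH_NONVOL(0) | (5 << 4)              = 0x50 */
    /* .seh_stackalloc 40   -> UWOP_ALLOC_SMALL(2) | (((40 - 8) / 8) << 4) = 0x42 */
    /* .seh_savereg %rsi,64 -> UWOP_SAVE_NONVOL(4) | (6 << 4)              = 0x64,
     *                         followed by .value 64/8 = 8 (offset in 8-byte slots) */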
+ $$line = ""; + $label .= ":"; + return label->re(\$label); + } + } + + sub pdata_and_xdata { + return "" unless $win64; + + my $ret = ""; + if ($pdata ne "") { + $ret .= <<____; +.section .pdata +.align 4 +$pdata +____ + } + if ($xdata ne "") { + $ret .= <<____; +.section .xdata +.align 4 +$xdata +____ + } + return $ret; + } +} +{ package directive; # pick up directives, which start with . + my %sections; + sub nasm_section { + my ($name, $qualifiers) = @_; + my $ret = "section\t$name"; + if (exists $sections{$name}) { + # Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392701. Only + # emit section qualifiers the first time a section is referenced. + # For all subsequent references, require the qualifiers match and + # omit them. + # + # See also https://crbug.com/1422018 and b/270643835. + my $old = $sections{$name}; + die "Inconsistent qualifiers: $qualifiers vs $old" if ($qualifiers ne "" && $qualifiers ne $old); + } else { + $sections{$name} = $qualifiers; + if ($qualifiers ne "") { + $ret .= " $qualifiers"; + } + } + return $ret; + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive and seh_directive. + $ret = cfi_directive->re($line) and return $ret; + $ret = seh_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern/ + && do { $globals{$$line} = $prefix . $$line; + $$line = $globals{$$line} if ($prefix); + last; + }; + /\.type/ && do { my ($sym,$type,$narg) = split(/\s*,\s*/,$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad|\.byte/ + && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + if ($flavour eq "elf") { + $self->{value} .= "\n.hidden $$line"; + } else { + $self->{value} = ""; + } + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . + "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif (!$elf && $dir =~ /\.size/) { + $self->{value} = ""; + if (defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . 
(log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".rodata") { + if ($flavour eq "macosx") { $self->{value} = ".section\t__DATA,__const"; } + } + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.global|\.globl|\.extern/) { + if ($flavour eq "macosx") { + $self->{value} .= "\n.private_extern $$line"; + } else { + $self->{value} .= "\n.hidden $$line"; + } + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v=nasm_section(".text", "code align=64")."\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v=nasm_section(".data", "data align=8")."\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + $$line = ".rdata" if ($$line eq ".rodata"); + if ($nasm) { + my $qualifiers = ""; + if ($$line=~/\.([prx])data/) { + $qualifiers = "rdata align="; + $qualifiers .= $1 eq "p"? 4 : 8; + } elsif ($$line=~/\.CRT\$/i) { + $qualifiers = "rdata align=8"; + } + $v = nasm_section($$line, $qualifiers); + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([prx])data/) { + $v.=" READONLY"; + $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}:"; + $self->{value}.=":\n" if($masm); + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="\tDB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="\tDB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... + +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + 
rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... + +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... + +my $endbranch = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +{ + my $comment = "//"; + $comment = ";" if ($masm || $nasm); + print <<___; +$comment This file is generated from a similarly-named Perl script in the BoringSSL +$comment source tree. Do not edit by hand. + +___ +} + +if ($nasm) { + die "unknown target" unless ($win64); + print <<___; +\%ifidn __OUTPUT_FORMAT__, win64 +default rel +\%define XMMWORD +\%define YMMWORD +\%define ZMMWORD +\%define _CET_ENDBR + +\%include "ring_core_generated/prefix_symbols_nasm.inc" +___ +} elsif ($masm) { + print <<___; +OPTION DOTNAME +___ +} + +if ($gas) { + my $target; + if ($elf) { + # The "elf" target is really ELF with SysV ABI, but every ELF platform + # uses the SysV ABI. 
+ $target = "defined(__ELF__)"; + } elsif ($apple) { + $target = "defined(__APPLE__)"; + } else { + die "unknown target: $flavour"; + } + print <<___; +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && $target +___ +} + +sub process_line { + my $line = shift; + $line =~ s|\R$||; # Better chomp + + if ($nasm) { + $line =~ s|^#ifdef |%ifdef |; + $line =~ s|^#ifndef |%ifndef |; + $line =~ s|^#endif|%endif|; + $line =~ s|[#!].*$||; # get rid of asm-style comments... + } else { + # Get rid of asm-style comments but not preprocessor directives. The + # former are identified by having a letter after the '#' and starting in + # the first column. + $line =~ s|!.*$||; + $line =~ s|(?<=.)#.*$||; + $line =~ s|^#([^a-z].*)?$||; + } + + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + } + + print $line,"\n"; +} + +while(defined(my $line=<>)) { + process_line($line); +} +foreach my $line (split(/\n/, seh_directive->pdata_and_xdata())) { + process_line($line); +} + +print "\n$current_segment\tENDS\n" if ($current_segment && $masm); +if ($masm) { + print "END\n"; +} elsif ($gas) { + print "#endif\n"; +} elsif ($nasm) { + print <<___; +\%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +\%endif +___ +} else { + die "unknown assembler"; +} + +close STDOUT or die "error closing STDOUT: $!"; + + ################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. 
+# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. +# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. +# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... +# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# + ################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. Consider following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. 
In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module. In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. +# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads. References put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. 
If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required prior [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't it why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification). diff --git a/ring-0.17.14/crypto/perlasm/x86asm.pl b/ring-0.17.14/crypto/perlasm/x86asm.pl new file mode 100644 index 0000000000..7b685e0401 --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/x86asm.pl @@ -0,0 +1,368 @@ +#! /usr/bin/env perl +# Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# require 'x86asm.pl'; +# &asm_init([,$i386only]); +# &function_begin("foo"); +# ... +# &function_end("foo"); +# &asm_finish + +$out=(); +$i386=0; + +# AUTOLOAD is this context has quite unpleasant side effect, namely +# that typos in function calls effectively go to assembler output, +# but on the pros side we don't have to implement one subroutine per +# each opcode... +sub ::AUTOLOAD +{ my $opcode = $AUTOLOAD; + + die "more than 4 arguments passed to $opcode" if ($#_>3); + + $opcode =~ s/.*:://; + if ($opcode =~ /^push/) { $stack+=4; } + elsif ($opcode =~ /^pop/) { $stack-=4; } + + &generic($opcode,@_) or die "undefined subroutine \&$AUTOLOAD"; +} + +# record_function_hit(int) writes a byte with value one to the given offset of +# |BORINGSSL_function_hit|, but only if BORINGSSL_DISPATCH_TEST is defined. +# This is used in impl_dispatch_test.cc to test whether the expected assembly +# functions are triggered by high-level API calls. 
+sub ::record_function_hit +{ my($index)=@_; + &preprocessor_ifdef("BORINGSSL_DISPATCH_TEST"); + &push("ebx"); + &push("edx"); + &call(&label("pic_for_function_hit")); + &set_label("pic_for_function_hit"); + &blindpop("ebx"); + &lea("ebx",&DWP("BORINGSSL_function_hit+$index"."-".&label("pic_for_function_hit"),"ebx")); + &mov("edx", 1); + &movb(&BP(0, "ebx"), "dl"); + &pop("edx"); + &pop("ebx"); + &preprocessor_endif(); +} + +sub ::emit +{ my $opcode=shift; + + if ($#_==-1) { push(@out,"\t$opcode\n"); } + else { push(@out,"\t$opcode\t".join(',',@_)."\n"); } +} + +sub ::LB +{ $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'low byte'"; + $1."l"; +} +sub ::HB +{ $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'high byte'"; + $1."h"; +} +sub ::stack_push{ my $num=$_[0]*4; $stack+=$num; &sub("esp",$num); } +sub ::stack_pop { my $num=$_[0]*4; $stack-=$num; &add("esp",$num); } +sub ::blindpop { &pop($_[0]); $stack+=4; } +sub ::wparam { &DWP($stack+4*$_[0],"esp"); } +sub ::swtmp { &DWP(4*$_[0],"esp"); } + +sub ::bswap +{ if ($i386) # emulate bswap for i386 + { &comment("bswap @_"); + &xchg(&HB(@_),&LB(@_)); + &ror (@_,16); + &xchg(&HB(@_),&LB(@_)); + } + else + { &generic("bswap",@_); } +} +# These are made-up opcodes introduced over the years essentially +# by ignorance, just alias them to real ones... +sub ::movb { &mov(@_); } +sub ::xorb { &xor(@_); } +sub ::rotl { &rol(@_); } +sub ::rotr { &ror(@_); } +sub ::exch { &xchg(@_); } +sub ::halt { &hlt; } +sub ::movz { &movzx(@_); } +sub ::pushf { &pushfd; } +sub ::popf { &popfd; } + +# 3 argument instructions +sub ::movq +{ my($p1,$p2,$optimize)=@_; + + if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/) + # movq between mmx registers can sink Intel CPUs + { &::pshufw($p1,$p2,0xe4); } + else + { &::generic("movq",@_); } +} + +# SSE>2 instructions +my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3, + "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 ); +sub ::pextrd +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); } + else + { &::generic("pextrd",@_); } +} + +sub ::pinsrd +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/) + { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); } + else + { &::generic("pinsrd",@_); } +} + +sub ::pshufb +{ my($dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); } + else + { &::generic("pshufb",@_); } +} + +sub ::palignr +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); } + else + { &::generic("palignr",@_); } +} + +sub ::pclmulqdq +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); } + else + { &::generic("pclmulqdq",@_); } +} + +sub ::rdrand +{ my ($dst)=@_; + if ($dst =~ /(e[a-dsd][ixp])/) + { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); } + else + { &::generic("rdrand",@_); } +} + +sub ::rdseed +{ my ($dst)=@_; + if ($dst =~ /(e[a-dsd][ixp])/) + { &::data_byte(0x0f,0xc7,0xf8|$regrm{$dst}); } + else + { &::generic("rdrand",@_); } +} + +sub rxb { + local *opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @opcode,$rxb; +} + +sub ::vprotd +{ my $args=join(',',@_); + if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/) + { my @opcode=(0x8f); + 
rxb(\@opcode,$1,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($1&7)<<3); # ModR/M + my $c=$3; + push @opcode,$c=~/^0/?oct($c):$c; + &::data_byte(@opcode); + } + else + { &::generic("vprotd",@_); } +} + +sub ::endbranch +{ + &::data_byte(0xf3,0x0f,0x1e,0xfb); +} + +# label management +$lbdecor="L"; # local label decoration, set by package +$label="000"; + +sub ::islabel # see is argument is a known label +{ my $i; + foreach $i (values %label) { return $i if ($i eq $_[0]); } + $label{$_[0]}; # can be undef +} + +sub ::label # instantiate a function-scope label +{ if (!defined($label{$_[0]})) + { $label{$_[0]}="${lbdecor}${label}${_[0]}"; $label++; } + $label{$_[0]}; +} + +sub ::LABEL # instantiate a file-scope label +{ $label{$_[0]}=$_[1] if (!defined($label{$_[0]})); + $label{$_[0]}; +} + +sub ::static_label { &::LABEL($_[0],$lbdecor.$_[0]); } + +sub ::set_label_B { push(@out,"@_:\n"); } +sub ::set_label +{ my $label=&::label($_[0]); + &::align($_[1]) if ($_[1]>1); + &::set_label_B($label); + $label; +} + +sub ::wipe_labels # wipes function-scope labels +{ foreach $i (keys %label) + { delete $label{$i} if ($label{$i} =~ /^\Q${lbdecor}\E[0-9]{3}/); } +} + +# subroutine management +sub ::function_begin +{ &function_begin_B(@_); + $stack=4; + &push("ebp"); + &push("ebx"); + &push("esi"); + &push("edi"); +} + +sub ::function_end +{ &pop("edi"); + &pop("esi"); + &pop("ebx"); + &pop("ebp"); + &ret(); + &function_end_B(@_); + $stack=0; + &wipe_labels(); +} + +sub ::function_end_A +{ &pop("edi"); + &pop("esi"); + &pop("ebx"); + &pop("ebp"); + &ret(); + $stack+=16; # readjust esp as if we didn't pop anything +} + +sub ::asciz +{ my @str=unpack("C*",shift); + push @str,0; + while ($#str>15) { + &data_byte(@str[0..15]); + foreach (0..15) { shift @str; } + } + &data_byte(@str) if (@str); +} + +sub ::asm_finish +{ &file_end(); + my $comment = "//"; + $comment = ";" if ($win32); + print <<___; +$comment This file is generated from a similarly-named Perl script in the BoringSSL +$comment source tree. Do not edit by hand. + +___ + if ($win32) { + print <<___ unless $masm; +\%include "ring_core_generated/prefix_symbols_nasm.inc" +\%ifidn __OUTPUT_FORMAT__, win32 +___ + print @out; + print <<___ unless $masm; +\%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +\%endif +___ + } else { + my $target; + if ($elf) { + $target = "defined(__ELF__)"; + } elsif ($macosx) { + $target = "defined(__APPLE__)"; + } else { + die "unknown target"; + } + + print <<___; +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target +___ + print @out; + print <<___; +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && $target +___ + } +} + +sub ::asm_init +{ my ($type,$cpu)=@_; + + $i386=$cpu; + + $elf=$cpp=$coff=$aout=$macosx=$win32=$mwerks=$android=0; + if (($type eq "elf")) + { $elf=1; require "x86gas.pl"; } + elsif (($type eq "elf-1")) + { $elf=-1; require "x86gas.pl"; } + elsif (($type eq "a\.out")) + { $aout=1; require "x86gas.pl"; } + elsif (($type eq "coff" or $type eq "gaswin")) + { $coff=1; require "x86gas.pl"; } + elsif (($type eq "win32n")) + { $win32=1; require "x86nasm.pl"; } + elsif (($type eq "win32")) + { $win32=1; $masm=1; require "x86masm.pl"; } + elsif (($type eq "macosx")) + { $aout=1; $macosx=1; require "x86gas.pl"; } + elsif (($type eq "android")) + { $elf=1; $android=1; require "x86gas.pl"; } + else + { print STDERR <<"EOF"; +Pick one target type from + elf - Linux, FreeBSD, Solaris x86, etc. + a.out - DJGPP, elder OpenBSD, etc. 
+ coff - GAS/COFF such as Win32 targets + win32n - Windows 95/Windows NT NASM format + macosx - Mac OS X +EOF + exit(1); + } + + $pic=0; + for (@ARGV) { $pic=1 if (/\-[fK]PIC/i); } + + &file(); +} + +sub ::hidden {} + +1; diff --git a/ring-0.17.14/crypto/perlasm/x86gas.pl b/ring-0.17.14/crypto/perlasm/x86gas.pl new file mode 100644 index 0000000000..62a5710a8f --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/x86gas.pl @@ -0,0 +1,285 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +package x86gas; + +*out=\@::out; + +$::lbdecor=$::aout?"L":".L"; # local label decoration +$nmdecor=($::aout or $::coff)?"_":""; # external name decoration + +$initseg=""; + +$align=16; +$align=log($align)/log(2) if ($::aout); +$com_start="#" if ($::aout or $::coff); + +sub opsize() +{ my $reg=shift; + if ($reg =~ m/^%e/o) { "l"; } + elsif ($reg =~ m/^%[a-d][hl]$/o) { "b"; } + elsif ($reg =~ m/^%[yxm]/o) { undef; } + else { "w"; } +} + +# swap arguments; +# expand opcode with size suffix; +# prefix numeric constants with $; +sub ::generic +{ my($opcode,@arg)=@_; + my($suffix,$dst,$src); + + @arg=reverse(@arg); + + for (@arg) + { s/^(\*?)(e?[a-dsixphl]{2})$/$1%$2/o; # gp registers + s/^([xy]?mm[0-7])$/%$1/o; # xmm/mmx registers + s/^(\-?[0-9]+)$/\$$1/o; # constants + s/^(\-?0x[0-9a-f]+)$/\$$1/o; # constants + } + + $dst = $arg[$#arg] if ($#arg>=0); + $src = $arg[$#arg-1] if ($#arg>=1); + if ($dst =~ m/^%/o) { $suffix=&opsize($dst); } + elsif ($src =~ m/^%/o) { $suffix=&opsize($src); } + else { $suffix="l"; } + undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o); + + if ($#_==0) { &::emit($opcode); } + elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o) + { &::emit($opcode,@arg); } + else { &::emit($opcode.$suffix,@arg);} + + 1; +} +# +# opcodes not covered by ::generic above, mostly inconsistent namings... 
+# +sub ::movzx { &::movzb(@_); } +sub ::pushfd { &::pushfl; } +sub ::popfd { &::popfl; } +sub ::cpuid { &::emit(".byte\t0x0f,0xa2"); } +sub ::rdtsc { &::emit(".byte\t0x0f,0x31"); } + +sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } +sub ::call_ptr { &::generic("call","*$_[0]"); } +sub ::jmp_ptr { &::generic("jmp","*$_[0]"); } + +*::bswap = sub { &::emit("bswap","%$_[0]"); } if (!$::i386); + +sub ::DWP +{ my($addr,$reg1,$reg2,$idx)=@_; + my $ret=""; + + if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } + + $addr =~ s/^\s+//; + # prepend global references with optional underscore + $addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige; + + $reg1 = "%$reg1" if ($reg1); + $reg2 = "%$reg2" if ($reg2); + + $ret .= $addr if (($addr ne "") && ($addr ne 0)); + + if ($reg2) + { $idx!= 0 or $idx=1; + $ret .= "($reg1,$reg2,$idx)"; + } + elsif ($reg1) + { $ret .= "($reg1)"; } + + $ret; +} +sub ::QWP { &::DWP(@_); } +sub ::BP { &::DWP(@_); } +sub ::WP { &::DWP(@_); } +sub ::BC { @_; } +sub ::DWC { @_; } + +sub ::file +{ push(@out,".text\n"); } + +sub ::function_begin_B +{ my $func=shift; + my $global=($func !~ /^_/); + my $begin="${::lbdecor}_${func}_begin"; + + &::LABEL($func,$global?"$begin":"$nmdecor$func"); + $func=$nmdecor.$func; + + push(@out,".globl\t$func\n") if ($global); + if ($::macosx) { + push(@out,".private_extern\t$func\n"); + } else { + push(@out,".hidden\t$func\n"); + } + if ($::coff) + { push(@out,".def\t$func;\t.scl\t".(3-$global).";\t.type\t32;\t.endef\n"); } + elsif (($::aout and !$::pic) or $::macosx) + { } + else + { push(@out,".type $func,\@function\n"); } + push(@out,".align\t$align\n"); + push(@out,"$func:\n"); + push(@out,"$begin:\n") if ($global); + $::stack=4; +} + +sub ::function_end_B +{ my $func=shift; + push(@out,".size\t$nmdecor$func,.-".&::LABEL($func)."\n") if ($::elf); + $::stack=0; + &::wipe_labels(); +} + +sub ::comment + { + if (!defined($com_start) or $::elf) + { # Regarding $::elf above... + # GNU and SVR4 as'es use different comment delimiters, + push(@out,"\n"); # so we just skip ELF comments... 
+ return; + } + foreach (@_) + { + if (/^\s*$/) + { push(@out,"\n"); } + else + { push(@out,"\t$com_start $_ $com_end\n"); } + } + } + +sub ::external_label +{ foreach(@_) { &::LABEL($_,$nmdecor.$_); } } + +sub ::public_label +{ push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } + +sub ::file_end +{ if ($::macosx) + { if (%non_lazy_ptr) + { push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n"); + foreach $i (keys %non_lazy_ptr) + { push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); } + } + } + if (0 && grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { + my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,16"; + if ($::macosx) { push (@out,"$tmp,2\n"); } + elsif ($::elf) { push (@out,"$tmp,4\n"); } + else { push (@out,"$tmp\n"); } + } + push(@out,$initseg) if ($initseg); +} + +sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); } +sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); } +sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); } + +sub ::align +{ my $val=$_[0]; + if ($::aout) + { $val=int(log($val)/log(2)); + $val.=",0x90"; + } + push(@out,".align\t$val\n"); +} + +sub ::picmeup +{ my($dst,$sym,$base,$reflabel)=@_; + + if (($::pic && ($::elf || $::aout)) || $::macosx) + { if (!defined($base)) + { &::call(&::label("PIC_me_up")); + &::set_label("PIC_me_up"); + &::blindpop($dst); + $base=$dst; + $reflabel=&::label("PIC_me_up"); + } + if ($::macosx) + { my $indirect=&::static_label("$nmdecor$sym\$non_lazy_ptr"); + &::mov($dst,&::DWP("$indirect-$reflabel",$base)); + $non_lazy_ptr{"$nmdecor$sym"}=$indirect; + } + elsif ($sym eq "OPENSSL_ia32cap_P" && $::elf>0) + { &::lea($dst,&::DWP("$sym-$reflabel",$base)); } + else + { &::lea($dst,&::DWP("_GLOBAL_OFFSET_TABLE_+[.-$reflabel]", + $base)); + &::mov($dst,&::DWP("$sym\@GOT",$dst)); + } + } + else + { &::lea($dst,&::DWP($sym)); } +} + +sub ::initseg +{ my $f=$nmdecor.shift; + + if ($::android) + { $initseg.=<<___; +.section .init_array +.align 4 +.long $f +___ + } + elsif ($::elf) + { $initseg.=<<___; +.section .init + call $f +___ + } + elsif ($::coff) + { $initseg.=<<___; # applies to both Cygwin and Mingw +.section .ctors +.long $f +___ + } + elsif ($::macosx) + { $initseg.=<<___; +.mod_init_func +.align 2 +.long $f +___ + } + elsif ($::aout) + { my $ctor="${nmdecor}_GLOBAL_\$I\$$f"; + $initseg.=".text\n"; + $initseg.=".type $ctor,\@function\n" if ($::pic); + $initseg.=<<___; # OpenBSD way... +.globl $ctor +.align 2 +$ctor: + jmp $f +___ + } +} + +sub ::dataseg +{ push(@out,".data\n"); } + +sub ::preprocessor_ifdef +{ my($define)=@_; + push(@out,"#ifdef ${define}\n"); +} + +sub ::preprocessor_endif +{ push(@out,"#endif\n"); } + +*::hidden = sub { push(@out,".hidden\t$nmdecor$_[0]\n"); } if ($::elf); + +1; diff --git a/ring-0.17.14/crypto/perlasm/x86nasm.pl b/ring-0.17.14/crypto/perlasm/x86nasm.pl new file mode 100644 index 0000000000..61c70f48a1 --- /dev/null +++ b/ring-0.17.14/crypto/perlasm/x86nasm.pl @@ -0,0 +1,201 @@ +#! /usr/bin/env perl +# Copyright 1999-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +package x86nasm; + +*out=\@::out; + +$::lbdecor="L\$"; # local label decoration +$nmdecor="_"; # external name decoration +$drdecor=$::mwerks?".":""; # directive decoration + +$initseg=""; + +sub ::generic +{ my $opcode=shift; + my $tmp; + + if (!$::mwerks) + { if ($opcode =~ m/^j/o && $#_==0) # optimize jumps + { $_[0] = "NEAR $_[0]"; } + elsif ($opcode eq "lea" && $#_==1) # wipe storage qualifier from lea + { $_[1] =~ s/^[^\[]*\[/\[/o; } + elsif ($opcode eq "clflush" && $#_==0) + { $_[0] =~ s/^[^\[]*\[/\[/o; } + } + &::emit($opcode,@_); + 1; +} +# +# opcodes not covered by ::generic above, mostly inconsistent namings... +# +sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } +sub ::call_ptr { &::emit("call",@_); } +sub ::jmp_ptr { &::emit("jmp",@_); } + +sub get_mem +{ my($size,$addr,$reg1,$reg2,$idx)=@_; + my($post,$ret); + + if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } + + if ($size ne "") + { $ret .= "$size"; + $ret .= " PTR" if ($::mwerks); + $ret .= " "; + } + $ret .= "["; + + $addr =~ s/^\s+//; + # prepend global references with optional underscore + $addr =~ s/^([^\+\-0-9][^\+\-]*)/::islabel($1) or "$nmdecor$1"/ige; + # put address arithmetic expression in parenthesis + $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/); + + if (($addr ne "") && ($addr ne 0)) + { if ($addr !~ /^-/) { $ret .= "$addr+"; } + else { $post=$addr; } + } + + if ($reg2 ne "") + { $idx!=0 or $idx=1; + $ret .= "$reg2*$idx"; + $ret .= "+$reg1" if ($reg1 ne ""); + } + else + { $ret .= "$reg1"; } + + $ret .= "$post]"; + $ret =~ s/\+\]/]/; # in case $addr was the only argument + + $ret; +} +sub ::BP { &get_mem("BYTE",@_); } +sub ::DWP { &get_mem("DWORD",@_); } +sub ::WP { &get_mem("WORD",@_); } +sub ::QWP { &get_mem("",@_); } +sub ::BC { (($::mwerks)?"":"BYTE ")."@_"; } +sub ::DWC { (($::mwerks)?"":"DWORD ")."@_"; } + +sub ::file +{ if ($::mwerks) { push(@out,".section\t.text,64\n"); } + else + { my $tmp=<<___; +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +\$\@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +___ + push(@out,$tmp); + } +} + +sub ::function_begin_B +{ my $func=shift; + my $global=($func !~ /^_/); + my $begin="${::lbdecor}_${func}_begin"; + + $begin =~ s/^\@/./ if ($::mwerks); # the torture never stops + + &::LABEL($func,$global?"$begin":"$nmdecor$func"); + $func=$nmdecor.$func; + + push(@out,"${drdecor}global $func\n") if ($global); + push(@out,"${drdecor}align 16\n"); + push(@out,"$func:\n"); + push(@out,"$begin:\n") if ($global); + $::stack=4; +} + +sub ::function_end_B +{ $::stack=0; + &::wipe_labels(); +} + +sub ::file_end +{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) + { my $comm=<<___; +${drdecor}segment .bss +${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16 +___ + # comment out OPENSSL_ia32cap_P declarations + grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out; + push (@out,$comm) + } + push (@out,$initseg) if ($initseg); +} + +sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } } + +sub ::external_label +{ foreach(@_) + { push(@out,"${drdecor}extern\t".&::LABEL($_,$nmdecor.$_)."\n"); } +} + +sub ::public_label +{ push(@out,"${drdecor}global\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } + +sub ::data_byte +{ push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n"); } +sub ::data_short +{ 
push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n"); } +sub ::data_word +{ push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n"); } + +sub ::align +{ push(@out,"${drdecor}align\t$_[0]\n"); } + +sub ::picmeup +{ my($dst,$sym)=@_; + &::lea($dst,&::DWP($sym)); +} + +sub ::initseg +{ my $f=$nmdecor.shift; + if ($::win32) + { $initseg=<<___; +segment .CRT\$XCU data align=4 +extern $f +dd $f +___ + } +} + +sub ::dataseg +{ if ($mwerks) { push(@out,".section\t.data,4\n"); } + else { push(@out,"section\t.data align=4\n"); } +} + +sub ::safeseh +{ my $nm=shift; + push(@out,"%if __NASM_VERSION_ID__ >= 0x02030000\n"); + push(@out,"safeseh ".&::LABEL($nm,$nmdecor.$nm)."\n"); + push(@out,"%endif\n"); +} + +sub ::preprocessor_ifdef +{ my($define)=@_; + push(@out,"%ifdef ${define}\n"); +} + +sub ::preprocessor_endif +{ push(@out,"%endif\n"); } + +1; diff --git a/ring-0.17.14/crypto/poly1305/poly1305.c b/ring-0.17.14/crypto/poly1305/poly1305.c new file mode 100644 index 0000000000..67595362f8 --- /dev/null +++ b/ring-0.17.14/crypto/poly1305/poly1305.c @@ -0,0 +1,246 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// This implementation of poly1305 is by Andrew Moon +// (https://github.com/floodyberry/poly1305-donna) and released as public +// domain. + +#include + +#include "../internal.h" +#include "ring-core/check.h" + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } + +// Keep in sync with `poly1305_state_st` in ffi_fallback.rs. +struct poly1305_state_st { + alignas(64) uint32_t r0; + uint32_t r1, r2, r3, r4; + uint32_t s1, s2, s3, s4; + uint32_t h0, h1, h2, h3, h4; + uint8_t key[16]; +}; + +// poly1305_blocks updates |state| given some amount of input data. This +// function may only be called with a |len| that is not a multiple of 16 at the +// end of the data. Otherwise the input must be buffered into 16 byte blocks. 
+static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t len) { + debug_assert_nonsecret((uintptr_t)state % 64 == 0); + + uint32_t t0, t1, t2, t3; + uint64_t t[5]; + uint32_t b; + uint64_t c; + size_t j; + uint8_t mp[16]; + + if (len < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_16bytes: + t0 = CRYPTO_load_u32_le(in); + t1 = CRYPTO_load_u32_le(in + 4); + t2 = CRYPTO_load_u32_le(in + 8); + t3 = CRYPTO_load_u32_le(in + 12); + + in += 16; + len -= 16; + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8) | (1 << 24); + +poly1305_donna_mul: + t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + + mul32x32_64(state->h4, state->s1); + t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + + mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + + mul32x32_64(state->h4, state->s2); + t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + + mul32x32_64(state->h4, state->s3); + t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + + mul32x32_64(state->h4, state->s4); + t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + + mul32x32_64(state->h4, state->r0); + + state->h0 = (uint32_t)t[0] & 0x3ffffff; + c = (t[0] >> 26); + t[1] += c; + state->h1 = (uint32_t)t[1] & 0x3ffffff; + b = (uint32_t)(t[1] >> 26); + t[2] += b; + state->h2 = (uint32_t)t[2] & 0x3ffffff; + b = (uint32_t)(t[2] >> 26); + t[3] += b; + state->h3 = (uint32_t)t[3] & 0x3ffffff; + b = (uint32_t)(t[3] >> 26); + t[4] += b; + state->h4 = (uint32_t)t[4] & 0x3ffffff; + b = (uint32_t)(t[4] >> 26); + state->h0 += b * 5; + + if (len >= 16) { + goto poly1305_donna_16bytes; + } + +// final bytes +poly1305_donna_atmost15bytes: + if (!len) { + return; + } + + for (j = 0; j < len; j++) { + mp[j] = in[j]; + } + mp[j++] = 1; + for (; j < 16; j++) { + mp[j] = 0; + } + len = 0; + + t0 = CRYPTO_load_u32_le(mp + 0); + t1 = CRYPTO_load_u32_le(mp + 4); + t2 = CRYPTO_load_u32_le(mp + 8); + t3 = CRYPTO_load_u32_le(mp + 12); + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8); + + goto poly1305_donna_mul; +} + +void CRYPTO_poly1305_init(struct poly1305_state_st *state, const uint8_t key[32]) { + debug_assert_nonsecret((uintptr_t)state % 64 == 0); + + uint32_t t0, t1, t2, t3; + + t0 = CRYPTO_load_u32_le(key + 0); + t1 = CRYPTO_load_u32_le(key + 4); + t2 = CRYPTO_load_u32_le(key + 8); + t3 = CRYPTO_load_u32_le(key + 12); + + // precompute multipliers + state->r0 = t0 & 0x3ffffff; + t0 >>= 26; + t0 |= t1 << 6; + state->r1 = t0 & 0x3ffff03; + t1 >>= 20; + t1 |= t2 << 12; + state->r2 = t1 & 0x3ffc0ff; + t2 >>= 14; + t2 |= t3 << 18; + state->r3 = t2 & 0x3f03fff; + t3 >>= 8; + state->r4 = t3 & 0x00fffff; + + state->s1 = state->r1 * 5; + state->s2 = state->r2 * 5; + state->s3 = state->r3 * 5; + 
state->s4 = state->r4 * 5; + + // init state + state->h0 = 0; + state->h1 = 0; + state->h2 = 0; + state->h3 = 0; + state->h4 = 0; + + OPENSSL_memcpy(state->key, key + 16, sizeof(state->key)); +} + +void CRYPTO_poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t in_len) { + // Work around a C language bug. See https://crbug.com/1019588. + if (in_len == 0) { + return; + } + + poly1305_update(state, in, in_len); +} + +void CRYPTO_poly1305_finish(struct poly1305_state_st *state, uint8_t mac[16]) { + uint32_t g0, g1, g2, g3, g4; + uint32_t b, nb; + + b = state->h0 >> 26; + state->h0 = state->h0 & 0x3ffffff; + state->h1 += b; + b = state->h1 >> 26; + state->h1 = state->h1 & 0x3ffffff; + state->h2 += b; + b = state->h2 >> 26; + state->h2 = state->h2 & 0x3ffffff; + state->h3 += b; + b = state->h3 >> 26; + state->h3 = state->h3 & 0x3ffffff; + state->h4 += b; + b = state->h4 >> 26; + state->h4 = state->h4 & 0x3ffffff; + state->h0 += b * 5; + + g0 = state->h0 + 5; + b = g0 >> 26; + g0 &= 0x3ffffff; + g1 = state->h1 + b; + b = g1 >> 26; + g1 &= 0x3ffffff; + g2 = state->h2 + b; + b = g2 >> 26; + g2 &= 0x3ffffff; + g3 = state->h3 + b; + b = g3 >> 26; + g3 &= 0x3ffffff; + g4 = state->h4 + b - (1 << 26); + + b = (g4 >> 31) - 1; + nb = ~b; + state->h0 = (state->h0 & nb) | (g0 & b); + state->h1 = (state->h1 & nb) | (g1 & b); + state->h2 = (state->h2 & nb) | (g2 & b); + state->h3 = (state->h3 & nb) | (g3 & b); + state->h4 = (state->h4 & nb) | (g4 & b); + + uint64_t f0 = ((state->h0) | (state->h1 << 26)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[0]); + uint64_t f1 = ((state->h1 >> 6) | (state->h2 << 20)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[4]); + uint64_t f2 = ((state->h2 >> 12) | (state->h3 << 14)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[8]); + uint64_t f3 = ((state->h3 >> 18) | (state->h4 << 8)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[12]); + + CRYPTO_store_u32_le(&mac[0], (uint32_t)f0); + f1 += (f0 >> 32); + CRYPTO_store_u32_le(&mac[4], (uint32_t)f1); + f2 += (f1 >> 32); + CRYPTO_store_u32_le(&mac[8], (uint32_t)f2); + f3 += (f2 >> 32); + CRYPTO_store_u32_le(&mac[12], (uint32_t)f3); +} diff --git a/ring-0.17.14/crypto/poly1305/poly1305_arm.c b/ring-0.17.14/crypto/poly1305/poly1305_arm.c new file mode 100644 index 0000000000..632eccafb1 --- /dev/null +++ b/ring-0.17.14/crypto/poly1305/poly1305_arm.c @@ -0,0 +1,302 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// This implementation was taken from the public domain, neon2 version in +// SUPERCOP by D. J. Bernstein and Peter Schwabe. 
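+//
+// Layout note: an fe1305x2 (declared below) packs two Poly1305 field elements
+// for the NEON code: the even words v[0],v[2],...,v[8] are the five 26-bit
+// limbs of one element and the odd words v[1],v[3],...,v[9] those of a second,
+// which is why only 10 of its 12 words are used. CRYPTO_poly1305_init_neon
+// precomputes r^2 and r^4 so the assembly blocks() routine can absorb two
+// 16-byte blocks per pass; this is a summary of the neon2 design referenced
+// above rather than a normative description.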
+ +#include + +#include "../internal.h" + + +#pragma GCC diagnostic ignored "-Wsign-conversion" + +// Keep in sync with ffi_arm_neon.rs +typedef struct { + uint32_t v[12]; // for alignment; only using 10 +} fe1305x2; + +#define addmulmod openssl_poly1305_neon2_addmulmod +#define blocks openssl_poly1305_neon2_blocks + +extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, + const fe1305x2 *c); + +extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in, + size_t inlen); + +static void freeze(fe1305x2 *r) { + int i; + + uint32_t x0 = r->v[0]; + uint32_t x1 = r->v[2]; + uint32_t x2 = r->v[4]; + uint32_t x3 = r->v[6]; + uint32_t x4 = r->v[8]; + uint32_t y0; + uint32_t y1; + uint32_t y2; + uint32_t y3; + uint32_t y4; + uint32_t swap; + + for (i = 0; i < 3; ++i) { + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + x0 += 5 * (x4 >> 26); + x4 &= 0x3ffffff; + } + + y0 = x0 + 5; + y1 = x1 + (y0 >> 26); + y0 &= 0x3ffffff; + y2 = x2 + (y1 >> 26); + y1 &= 0x3ffffff; + y3 = x3 + (y2 >> 26); + y2 &= 0x3ffffff; + y4 = x4 + (y3 >> 26); + y3 &= 0x3ffffff; + swap = -(y4 >> 26); + y4 &= 0x3ffffff; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + y0 &= swap; + y1 &= swap; + y2 &= swap; + y3 &= swap; + y4 &= swap; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + r->v[0] = y0; + r->v[2] = y1; + r->v[4] = y2; + r->v[6] = y3; + r->v[8] = y4; +} + +static void store32(uint8_t out[4], uint32_t v) { OPENSSL_memcpy(out, &v, 4); } + +// load32 exists to avoid breaking strict aliasing rules in +// fe1305x2_frombytearray. +static uint32_t load32(const uint8_t t[4]) { + uint32_t tmp; + OPENSSL_memcpy(&tmp, t, sizeof(tmp)); + return tmp; +} + +static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) { + uint32_t x0 = x->v[0]; + uint32_t x1 = x->v[2]; + uint32_t x2 = x->v[4]; + uint32_t x3 = x->v[6]; + uint32_t x4 = x->v[8]; + + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + + store32(r, x0 + (x1 << 26)); + store32(r + 4, (x1 >> 6) + (x2 << 20)); + store32(r + 8, (x2 >> 12) + (x3 << 14)); + store32(r + 12, (x3 >> 18) + (x4 << 8)); +} + +static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) { + size_t i; + uint8_t t[17]; + + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + xlen -= i; + x += i; + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[0] = 0x3ffffff & load32(t); + r->v[2] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[4] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[6] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[8] = load32(t + 13); + + if (xlen) { + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[1] = 0x3ffffff & load32(t); + r->v[3] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[5] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[7] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[9] = load32(t + 13); + } else { + r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; + } +} + +static const alignas(16) fe1305x2 zero; + +// Keep in sync with ffi_arm_neon.rs +struct poly1305_state_st { + alignas(16) fe1305x2 r; + fe1305x2 h; + fe1305x2 c; + fe1305x2 precomp[2]; + uint8_t data[128]; + + uint8_t buf[32]; + size_t buf_used; + uint8_t key[16]; +}; + +OPENSSL_STATIC_ASSERT(sizeof(fe1305x2) == 48, "fe1305x2 size is different than expected"); + +void 
CRYPTO_poly1305_init_neon(struct poly1305_state_st *st, const uint8_t key[32]) { + fe1305x2 *const r = &st->r; + fe1305x2 *const h = &st->h; + fe1305x2 *const precomp = &st->precomp[0]; + + r->v[1] = r->v[0] = 0x3ffffff & load32(key); + r->v[3] = r->v[2] = 0x3ffff03 & (load32(key + 3) >> 2); + r->v[5] = r->v[4] = 0x3ffc0ff & (load32(key + 6) >> 4); + r->v[7] = r->v[6] = 0x3f03fff & (load32(key + 9) >> 6); + r->v[9] = r->v[8] = 0x00fffff & (load32(key + 12) >> 8); + + for (size_t j = 0; j < 10; j++) { + h->v[j] = 0; // XXX: should fast-forward a bit + } + + addmulmod(precomp, r, r, &zero); // precompute r^2 + addmulmod(precomp + 1, precomp, precomp, &zero); // precompute r^4 + + OPENSSL_memcpy(st->key, key + 16, 16); + st->buf_used = 0; +} + +void CRYPTO_poly1305_update_neon(struct poly1305_state_st *st, const uint8_t *in, + size_t in_len) { + fe1305x2 *const h = &st->h; + fe1305x2 *const c = &st->c; + fe1305x2 *const precomp = &st->precomp[0]; + + if (st->buf_used) { + size_t todo = 32 - st->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (size_t i = 0; i < todo; i++) { + st->buf[st->buf_used + i] = in[i]; + } + st->buf_used += todo; + in_len -= todo; + in += todo; + + if (st->buf_used == sizeof(st->buf) && in_len) { + addmulmod(h, h, precomp, &zero); + fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); + for (size_t i = 0; i < 10; i++) { + h->v[i] += c->v[i]; + } + st->buf_used = 0; + } + } + + while (in_len > 32) { + size_t tlen = 1048576; + if (in_len < tlen) { + tlen = in_len; + } + tlen -= blocks(h, precomp, in, tlen); + in_len -= tlen; + in += tlen; + } + + if (in_len) { + for (size_t i = 0; i < in_len; i++) { + st->buf[i] = in[i]; + } + st->buf_used = in_len; + } +} + +void CRYPTO_poly1305_finish_neon(struct poly1305_state_st *st, uint8_t mac[16]) { + fe1305x2 *const r = &st->r; + fe1305x2 *const h = &st->h; + fe1305x2 *const c = &st->c; + fe1305x2 *const precomp = &st->precomp[0]; + + addmulmod(h, h, precomp, &zero); + + if (st->buf_used > 16) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + precomp->v[1] = r->v[1]; + precomp->v[3] = r->v[3]; + precomp->v[5] = r->v[5]; + precomp->v[7] = r->v[7]; + precomp->v[9] = r->v[9]; + addmulmod(h, h, precomp, c); + } else if (st->buf_used > 0) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + r->v[1] = 1; + r->v[3] = 0; + r->v[5] = 0; + r->v[7] = 0; + r->v[9] = 0; + addmulmod(h, h, r, c); + } + + h->v[0] += h->v[1]; + h->v[2] += h->v[3]; + h->v[4] += h->v[5]; + h->v[6] += h->v[7]; + h->v[8] += h->v[9]; + freeze(h); + + fe1305x2_frombytearray(c, st->key, 16); + c->v[8] ^= (1 << 24); + + h->v[0] += c->v[0]; + h->v[2] += c->v[2]; + h->v[4] += c->v[4]; + h->v[6] += c->v[6]; + h->v[8] += c->v[8]; + fe1305x2_tobytearray(mac, h); +} diff --git a/ring-0.17.14/crypto/poly1305/poly1305_arm_asm.S b/ring-0.17.14/crypto/poly1305/poly1305_arm_asm.S new file mode 100644 index 0000000000..df464d068d --- /dev/null +++ b/ring-0.17.14/crypto/poly1305/poly1305_arm_asm.S @@ -0,0 +1,2019 @@ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) + +#pragma GCC diagnostic ignored "-Wlanguage-extension-token" + +# This implementation was taken from the public domain, neon2 version in +# SUPERCOP by D. J. Bernstein and Peter Schwabe. 
+ +# qhasm: int32 input_0 + +# qhasm: int32 input_1 + +# qhasm: int32 input_2 + +# qhasm: int32 input_3 + +# qhasm: stack32 input_4 + +# qhasm: stack32 input_5 + +# qhasm: stack32 input_6 + +# qhasm: stack32 input_7 + +# qhasm: int32 caller_r4 + +# qhasm: int32 caller_r5 + +# qhasm: int32 caller_r6 + +# qhasm: int32 caller_r7 + +# qhasm: int32 caller_r8 + +# qhasm: int32 caller_r9 + +# qhasm: int32 caller_r10 + +# qhasm: int32 caller_r11 + +# qhasm: int32 caller_r12 + +# qhasm: int32 caller_r14 + +# qhasm: reg128 caller_q4 + +# qhasm: reg128 caller_q5 + +# qhasm: reg128 caller_q6 + +# qhasm: reg128 caller_q7 + +# qhasm: startcode +.fpu neon +.text + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 x01 + +# qhasm: reg128 x23 + +# qhasm: reg128 x4 + +# qhasm: reg128 y0 + +# qhasm: reg128 y12 + +# qhasm: reg128 y34 + +# qhasm: reg128 5y12 + +# qhasm: reg128 5y34 + +# qhasm: stack128 y0_stack + +# qhasm: stack128 y12_stack + +# qhasm: stack128 y34_stack + +# qhasm: stack128 5y12_stack + +# qhasm: stack128 5y34_stack + +# qhasm: reg128 z0 + +# qhasm: reg128 z12 + +# qhasm: reg128 z34 + +# qhasm: reg128 5z12 + +# qhasm: reg128 5z34 + +# qhasm: stack128 z0_stack + +# qhasm: stack128 z12_stack + +# qhasm: stack128 z34_stack + +# qhasm: stack128 5z12_stack + +# qhasm: stack128 5z34_stack + +# qhasm: stack128 two24 + +# qhasm: int32 ptr + +# qhasm: reg128 c01 + +# qhasm: reg128 c23 + +# qhasm: reg128 d01 + +# qhasm: reg128 d23 + +# qhasm: reg128 t0 + +# qhasm: reg128 t1 + +# qhasm: reg128 t2 + +# qhasm: reg128 t3 + +# qhasm: reg128 t4 + +# qhasm: reg128 mask + +# qhasm: reg128 u0 + +# qhasm: reg128 u1 + +# qhasm: reg128 u2 + +# qhasm: reg128 u3 + +# qhasm: reg128 u4 + +# qhasm: reg128 v01 + +# qhasm: reg128 mid + +# qhasm: reg128 v23 + +# qhasm: reg128 v4 + +# qhasm: int32 len + +# qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks +.align 4 +.global openssl_poly1305_neon2_blocks +.hidden openssl_poly1305_neon2_blocks +.type openssl_poly1305_neon2_blocks STT_FUNC +openssl_poly1305_neon2_blocks: +vpush {q4,q5,q6,q7} +mov r12,sp +sub sp,sp,#192 +bic sp,sp,#31 + +# qhasm: len = input_3 +# asm 1: mov >len=int32#4,len=r3,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[input_1=int32#2,input_1=r1,z12=reg128#5%bot->z12=reg128#5%top},[z12=d8->z12=d9},[z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[mask=reg128#7,#0xffffffff +# asm 2: vmov.i64 >mask=q6,#0xffffffff +vmov.i64 q6,#0xffffffff + +# qhasm: 2x u4 = 0xff +# asm 1: vmov.i64 >u4=reg128#8,#0xff +# asm 2: vmov.i64 >u4=q7,#0xff +vmov.i64 q7,#0xff + +# qhasm: x01 aligned= mem128[input_0];input_0+=16 +# asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[x01=d16->x01=d17},[x23=reg128#10%bot->x23=reg128#10%top},[x23=d18->x23=d19},[input_0=int32#1,input_0=r0,>=6 +# asm 1: vshr.u64 >mask=reg128#7,mask=q6,>= 7 +# asm 1: vshr.u64 >u4=reg128#8,u4=q7,5y12=reg128#12,5y12=q11,5y34=reg128#13,5y34=q12,5y12=reg128#12,<5y12=reg128#12,5y12=q11,<5y12=q11,5y34=reg128#13,<5y34=reg128#13,5y34=q12,<5y34=q12,u4=reg128#8,u4=q7,5z12=reg128#14,5z12=q13,5z34=reg128#15,5z34=q14,5z12=reg128#14,<5z12=reg128#14,5z12=q13,<5z12=q13,5z34=reg128#15,<5z34=reg128#15,5z34=q14,<5z34=q14,ptr=int32#2,ptr=r1,r4=reg128#16,r4=q15,r0=reg128#8,r0=q7,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r1,<5y12_stack=[sp,#64] +add 
r1,sp,#64 + +# qhasm: mem128[ptr] aligned= 5y12 +# asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[ptr=int32#2,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r1,<5y34_stack=[sp,#80] +add r1,sp,#80 + +# qhasm: mem128[ptr] aligned= 5y34 +# asm 1: vst1.8 {<5y34=reg128#13%bot-<5y34=reg128#13%top},[ptr=int32#2,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r1,<5z12_stack=[sp,#144] +add r1,sp,#144 + +# qhasm: mem128[ptr] aligned= 5z12 +# asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[ptr=int32#2,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r1,<5z34_stack=[sp,#160] +add r1,sp,#160 + +# qhasm: mem128[ptr] aligned= 5z34 +# asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[? len - 64 +# asm 1: cmp +bls ._below64bytes + +# qhasm: input_2 += 32 +# asm 1: add >input_2=int32#2,input_2=r1,c01=reg128#1%bot->c01=reg128#1%top},[c01=d0->c01=d1},[c23=reg128#2%bot->c23=reg128#2%top},[c23=d2->c23=d3},[ptr=int32#3,ptr=r2,z12=reg128#3%bot->z12=reg128#3%top},[z12=d4->z12=d5},[ptr=int32#3,ptr=r2,z0=reg128#4%bot->z0=reg128#4%top},[z0=d6->z0=d7},[r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,ptr=int32#3,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r2,<5z34_stack=[sp,#160] +add r2,sp,#160 + +# qhasm: 5z34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[5z34=d10->5z34=d11},[r0=reg128#8,r0=q7,r2=reg128#14,r2=q13,d01=reg128#12%bot->d01=reg128#12%top},[d01=d22->d01=d23},[r1=reg128#15,r1=q14,ptr=int32#3,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r2,<5z12_stack=[sp,#144] +add r2,sp,#144 + +# qhasm: 5z12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[5z12=d0->5z12=d1},[d23=reg128#2%bot->d23=reg128#2%top},[d23=d2->d23=d3},[input_2=int32#2,input_2=r1,> 40 +# asm 1: vshr.u64 >v4=reg128#4,v4=q3,> 14; v23[3] = d23[2,3] unsigned>> 14 +# asm 1: vshrn.u64 > 26; v01[3] = d01[2,3] unsigned>> 26 +# asm 1: vshrn.u64 > 20; v23[1] = mid[2,3] unsigned>> 20 +# asm 1: vshrn.u64 ptr=int32#3,ptr=r2,y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[ptr=int32#3,ptr=r2,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[ptr=int32#3,ptr=r2,y0=reg128#1%bot->y0=reg128#1%top},[y0=d0->y0=d1},[ptr=int32#3,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r2,<5y34_stack=[sp,#80] +add r2,sp,#80 + +# qhasm: 5y34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[5y34=d24->5y34=d25},[ptr=int32#3,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r2,<5y12_stack=[sp,#64] +add r2,sp,#64 + +# qhasm: 5y12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[5y12=d22->5y12=d23},[ptr=int32#3,ptr=r2,> 26 +# asm 1: vshr.u64 >t1=reg128#4,t1=q3,len=int32#4,len=r3,r0=reg128#6,r0=q5,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#5,r3=q4,x4=reg128#8,x4=q7,r4=reg128#16%bot->r4=reg128#16%top},[r4=d30->r4=d31},[> 26 +# asm 1: vshr.u64 >t2=reg128#9,t2=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t0=reg128#10,t0=q9,r2=reg128#9,r2=q8,x4=reg128#11,x4=q10,x01=reg128#6,x01=q5,r0=reg128#8%bot->r0=reg128#8%top},[r0=d14->r0=d15},[ptr=int32#3,ptr=r2,t0=reg128#10,t0=q9,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,x01=reg128#15,x01=q14,z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[x23=reg128#10,x23=q9,r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,> 26 +# asm 1: vshr.u64 >t1=reg128#14,t1=q13,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#14,t4=q13,r3=reg128#5,r3=q4,x4=reg128#11,x4=q10,? len - 64 +# asm 1: cmp +bhi ._mainloop2 + +# qhasm: input_2 -= 32 +# asm 1: sub >input_2=int32#3,input_2=r2,? 
len - 32 +# asm 1: cmp +bls ._end + +# qhasm: mainloop: +._mainloop: + +# qhasm: new r0 + +# qhasm: ptr = &two24 +# asm 1: lea >ptr=int32#2,ptr=r1,r4=reg128#5%bot->r4=reg128#5%top},[r4=d8->r4=d9},[u4=reg128#6%bot->u4=reg128#6%top},[u4=d10->u4=d11},[c01=reg128#8%bot->c01=reg128#8%top},[c01=d14->c01=d15},[c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[r0=reg128#4,r0=q3,r3=reg128#6,r3=q5,r1=reg128#14,r1=q13,r2=reg128#8,r2=q7,> 26 +# asm 1: vshr.u64 >t1=reg128#9,t1=q8,r0=reg128#4,r0=q3,r1=reg128#9,r1=q8,> 26 +# asm 1: vshr.u64 >t4=reg128#10,t4=q9,r3=reg128#6,r3=q5,r4=reg128#5,r4=q4,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#11,r1=q10,> 26 +# asm 1: vshr.u64 >t0=reg128#9,t0=q8,r2=reg128#8,r2=q7,r4=reg128#5,r4=q4,r0=reg128#4,r0=q3,t0=reg128#9,t0=q8,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#6,r3=q5,> 26 +# asm 1: vshr.u64 >t1=reg128#8,t1=q7,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#6,r3=q5,x4=reg128#11,x4=q10,len=int32#4,len=r3,? len - 32 +# asm 1: cmp +bhi ._mainloop + +# qhasm: end: +._end: + +# qhasm: mem128[input_0] = x01;input_0+=16 +# asm 1: vst1.8 {len=int32#1,len=r0,mask=reg128#1,#0xffffffff +# asm 2: vmov.i64 >mask=q0,#0xffffffff +vmov.i64 q0,#0xffffffff + +# qhasm: y01 aligned= mem128[input_2];input_2+=16 +# asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[y01=d2->y01=d3},[_5y01=reg128#3,_5y01=q2,y23=reg128#4%bot->y23=reg128#4%top},[y23=d6->y23=d7},[_5y23=reg128#9,_5y23=q8,_5y4=reg128#11,_5y4=q10,x01=reg128#12%bot->x01=reg128#12%top},[x01=d22->x01=d23},[_5y01=reg128#3,<_5y01=reg128#3,_5y01=q2,<_5y01=q2,x23=reg128#13%bot->x23=reg128#13%top},[x23=d24->x23=d25},[_5y23=reg128#9,<_5y23=reg128#9,_5y23=q8,<_5y23=q8,_5y4=reg128#11,<_5y4=reg128#11,_5y4=q10,<_5y4=q10,c01=reg128#14%bot->c01=reg128#14%top},[c01=d26->c01=d27},[x01=reg128#12,x01=q11,c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[x23=reg128#13,x23=q12,>=6 +# asm 1: vshr.u64 >mask=reg128#1,mask=q0,x4=reg128#14,x4=q13,r0=reg128#15,r0=q14,r1=reg128#3,r1=q2,r2=reg128#16,r2=q15,r3=reg128#9,r3=q8,r4=reg128#10,r4=q9,> 26 +# asm 1: vshr.u64 >t1=reg128#2,t1=q1,r0=reg128#4,r0=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#3,t4=q2,r3=reg128#9,r3=q8,r4=reg128#3,r4=q2,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t0=reg128#11,t0=q10,r2=reg128#10,r2=q9,r4=reg128#3,r4=q2,r0=reg128#4,r0=q3,t0=reg128#11,t0=q10,> 26 +# asm 1: vshr.u64 >t3=reg128#12,t3=q11,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#9,r3=q8,> 26 +# asm 1: vshr.u64 >t1=reg128#11,t1=q10,x01=reg128#4,x01=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#11,t4=q10,r3=reg128#1,r3=q0,x4=reg128#3,x4=q2, + +// Raw AES functions. + + +// AES_MAXNR is the maximum number of AES rounds. +#define AES_MAXNR 14 + +// aes_key_st should be an opaque type, but EVP requires that the size be +// known. +struct aes_key_st { + uint32_t rd_key[4 * (AES_MAXNR + 1)]; + unsigned rounds; +}; +typedef struct aes_key_st AES_KEY; + +#endif // OPENSSL_HEADER_AES_H diff --git a/ring-0.17.14/include/ring-core/asm_base.h b/ring-0.17.14/include/ring-core/asm_base.h new file mode 100644 index 0000000000..ef8332680f --- /dev/null +++ b/ring-0.17.14/include/ring-core/asm_base.h @@ -0,0 +1,204 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_ASM_BASE_H +#define OPENSSL_HEADER_ASM_BASE_H + +#include + + +// This header contains symbols and common sections used by assembly files. It +// is included as a public header to simplify the build, but is not intended for +// external use. +// +// Every assembly file must include this header. Some linker features require +// all object files to be tagged with some section metadata. This header file, +// when included in assembly, adds that metadata. It also makes defines like +// |OPENSSL_X86_64| available and includes the prefixing macros. +// +// Including this header in an assembly file implies: +// +// - The file does not require an executable stack. +// +// - The file, on aarch64, uses the macros defined below to be compatible with +// BTI and PAC. +// +// - The file, on x86_64, requires the program to be compatible with Intel IBT +// and SHSTK. + +#if defined(__ASSEMBLER__) + +#include + +#if defined(__ELF__) +// Every ELF object file, even empty ones, should disable executable stacks. See +// https://www.airs.com/blog/archives/518. +.pushsection .note.GNU-stack, "", %progbits +.popsection +#endif + +#if defined(__CET__) && defined(OPENSSL_X86_64) +// Clang and GCC define __CET__ and provide <cet.h> when they support Intel's +// Indirect Branch Tracking. +// https://lpc.events/event/7/contributions/729/attachments/496/903/CET-LPC-2020.pdf +// +// cet.h defines _CET_ENDBR which is used to mark function entry points for IBT +// and adds the assembly marker. The value of _CET_ENDBR is made dependent on whether +// '-fcf-protection' is passed to the compiler. _CET_ENDBR is only required when +// the function is the target of an indirect jump, but BoringSSL chooses to mark +// all assembly entry points because it is easier, and allows BoringSSL's ABI +// tester to call the assembly entry points via an indirect jump. +#include <cet.h> +#else +#define _CET_ENDBR +#endif + +#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) + +// We require the ARM assembler provide |__ARM_ARCH| from Arm C Language +// Extensions (ACLE). This is supported in GCC 4.8+ and Clang 3.2+. MSVC does +// not implement ACLE, but we require Clang's assembler on Windows. +#if !defined(__ARM_ARCH) +#error "ARM assembler must define __ARM_ARCH" +#endif + +// Even when building for 32-bit ARM, support for aarch64 crypto instructions +// will be included. +// +// TODO(davidben): Remove this and the corresponding ifdefs? This is only +// defined because some OpenSSL assembly files would allow disabling the NEON +// code entirely. I think we'd prefer to do that by lifting the dispatch to C +// anyway. +#define __ARM_MAX_ARCH__ 8 + +// Support macros for +// - Armv8.3-A Pointer Authentication and +// - Armv8.5-A Branch Target Identification +// features which require emitting a .note.gnu.property section with the +// appropriate architecture-dependent feature bits set. +// +// |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to +// PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be +// used immediately before saving the LR register (x30) to the stack. 
+// |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring +// it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone +// with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also +// have the same value at the two points. For example: +// +// .global f +// f: +// AARCH64_SIGN_LINK_REGISTER +// stp x29, x30, [sp, #-96]! +// mov x29, sp +// ... +// ldp x29, x30, [sp], #96 +// AARCH64_VALIDATE_LINK_REGISTER +// ret +// +// |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or +// |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an +// indirect call target. In particular, all symbols exported from a file must +// begin with one of these macros. For example, a leaf function that does not +// save LR can instead use |AARCH64_VALID_CALL_TARGET|: +// +// .globl return_zero +// return_zero: +// AARCH64_VALID_CALL_TARGET +// mov x0, #0 +// ret +// +// A non-leaf function which does not immediately save LR may need both macros +// because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function +// may jump to an alternate implementation before setting up the stack: +// +// .globl with_early_jump +// with_early_jump: +// AARCH64_VALID_CALL_TARGET +// cmp x0, #128 +// b.lt .Lwith_early_jump_128 +// AARCH64_SIGN_LINK_REGISTER +// stp x29, x30, [sp, #-96]! +// mov x29, sp +// ... +// ldp x29, x30, [sp], #96 +// AARCH64_VALIDATE_LINK_REGISTER +// ret +// +// .Lwith_early_jump_128: +// ... +// ret +// +// These annotations are only required with indirect calls. Private symbols that +// are only the target of direct calls do not require annotations. Also note +// that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not +// indirect jumps (BR). Indirect jumps in assembly are currently not supported +// and would require a macro for BTI 'j'. +// +// Although not necessary, it is safe to use these macros in 32-bit ARM +// assembly. This may be used to simplify dual 32-bit and 64-bit files. 
+// +// References: +// - "ELF for the Arm® 64-bit Architecture" +// https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst +// - "Providing protection for complex software" +// https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software + +#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 +#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification +#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' +#else +#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification +#define AARCH64_VALID_CALL_TARGET +#endif + +#if defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 // Signed with A-key +#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) // Has Pointer Authentication +#define AARCH64_SIGN_LINK_REGISTER hint #25 // PACIASP +#define AARCH64_VALIDATE_LINK_REGISTER hint #29 // AUTIASP +#elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 // Signed with B-key +#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) // Has Pointer Authentication +#define AARCH64_SIGN_LINK_REGISTER hint #27 // PACIBSP +#define AARCH64_VALIDATE_LINK_REGISTER hint #31 // AUTIBSP +#else +#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 // No Pointer Authentication +#if GNU_PROPERTY_AARCH64_BTI != 0 +#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET +#else +#define AARCH64_SIGN_LINK_REGISTER +#endif +#define AARCH64_VALIDATE_LINK_REGISTER +#endif + +#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 +.pushsection .note.gnu.property, "a"; +.balign 8; +.long 4; +.long 0x10; +.long 0x5; +.asciz "GNU"; +.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ +.long 4; +.long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); +.long 0; +.popsection; +#endif +#endif // ARM || AARCH64 + +#endif // __ASSEMBLER__ + +#endif // OPENSSL_HEADER_ASM_BASE_H diff --git a/ring-0.17.14/include/ring-core/base.h b/ring-0.17.14/include/ring-core/base.h new file mode 100644 index 0000000000..d18a55e278 --- /dev/null +++ b/ring-0.17.14/include/ring-core/base.h @@ -0,0 +1,66 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_BASE_H +#define OPENSSL_HEADER_BASE_H + + +// This file should be the first included by all BoringSSL headers. + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push, 3) +#endif + +#include +#include + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif + +#if defined(__APPLE__) +#include +#endif + +#include // IWYU pragma: export + +#include + +#include + +#if defined(__APPLE__) +// Note |TARGET_OS_MAC| is set for all Apple OS variants. |TARGET_OS_OSX| +// targets macOS specifically. 
+#if defined(TARGET_OS_OSX) && TARGET_OS_OSX +#define OPENSSL_MACOS +#endif +#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE +#define OPENSSL_IOS +#endif +#endif + +// *ring* doesn't support the `BORINGSSL_SHARED_LIBRARY` configuration, so +// the default (usually "hidden") visibility is always used, even for exported +// items. +#define OPENSSL_EXPORT + +// `ring::c` would need to be customized on any platform where these assertions +// fail. Keep in sync with `ring::c`. +OPENSSL_STATIC_ASSERT(sizeof(int32_t) == sizeof(int), "int isn't 32 bits."); +OPENSSL_STATIC_ASSERT(sizeof(uint32_t) == sizeof(unsigned int), "unsigned int isn't 32 bits."); +OPENSSL_STATIC_ASSERT(sizeof(size_t) == sizeof(uintptr_t), "uintptr_t and size_t differ."); +OPENSSL_STATIC_ASSERT(sizeof(size_t) <= sizeof(uint64_t), "size_t is larger than uint64_t."); +OPENSSL_STATIC_ASSERT(sizeof(size_t) >= sizeof(uint32_t), "size_t is smaller than uint32_t."); + +#endif // OPENSSL_HEADER_BASE_H diff --git a/ring-0.17.14/include/ring-core/check.h b/ring-0.17.14/include/ring-core/check.h new file mode 100644 index 0000000000..998289a1d0 --- /dev/null +++ b/ring-0.17.14/include/ring-core/check.h @@ -0,0 +1,52 @@ +// Copyright 2020 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#ifndef RING_CHECK_H +#define RING_CHECK_H + +// |debug_assert_nonsecret| is like |assert| and should be used (only) when the +// assertion does not have any potential to leak a secret. |NDEBUG| controls this +// exactly like |assert|. It is emulated when there is no assert.h to make +// cross-building easier. +// +// When reviewing uses of |debug_assert_nonsecret|, verify that the check +// really does not have potential to leak a secret. + +#if !defined(RING_CORE_NOSTDLIBINC) +# include <assert.h> +# define debug_assert_nonsecret(x) assert(x) +#else +# if !defined(NDEBUG) +# define debug_assert_nonsecret(x) ((x) ? ((void)0) : __builtin_trap()) +# else +# define debug_assert_nonsecret(x) ((void)0) +# endif +#endif + +// |dev_assert_secret| is like |assert| and should be used (only) when the +// assertion operates on secret data in a way that has the potential to leak +// the secret. |dev_assert_secret| can only be enabled by changing the |#if 0| +// here to |#if 1| (or equivalent) when |NDEBUG| is not defined. This is not +// controlled only through |NDEBUG| so that such checks do not leak into debug +// builds that may make it into production use. +// +// When reviewing uses of |dev_assert_secret|, verify that the check really +// does have the potential to leak a secret. +#if 0 // DO NOT COMMIT CHANGES TO THIS LINE. 
+# define dev_assert_secret debug_assert_nonsecret +#else +# define dev_assert_secret(x) ((void)0) +#endif + +#endif // RING_CHECK_H diff --git a/ring-0.17.14/include/ring-core/mem.h b/ring-0.17.14/include/ring-core/mem.h new file mode 100644 index 0000000000..8f8f32ec84 --- /dev/null +++ b/ring-0.17.14/include/ring-core/mem.h @@ -0,0 +1,27 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_MEM_H +#define OPENSSL_HEADER_MEM_H + +#include + +// CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. It +// takes an amount of time dependent on |len|, but independent of the contents +// of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a +// defined order as the return value when a != b is undefined, other than to be +// non-zero. +OPENSSL_EXPORT int CRYPTO_memcmp(const void *a, const void *b, size_t len); + +#endif // OPENSSL_HEADER_MEM_H diff --git a/ring-0.17.14/include/ring-core/target.h b/ring-0.17.14/include/ring-core/target.h new file mode 100644 index 0000000000..fe40db7722 --- /dev/null +++ b/ring-0.17.14/include/ring-core/target.h @@ -0,0 +1,97 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_TARGET_H +#define OPENSSL_HEADER_TARGET_H + +// Preprocessor symbols that define the target platform. +// +// This file may be included in C, C++, and assembler and must be compatible +// with each environment. It is separated out only to share code between +// <ring-core/base.h> and <ring-core/asm_base.h>. Prefer to include those headers +// instead. + +#if defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) +#define OPENSSL_64_BIT +#define OPENSSL_X86_64 +#elif defined(__x86) || defined(__i386) || defined(__i386__) || defined(_M_IX86) +#define OPENSSL_32_BIT +#define OPENSSL_X86 +#elif defined(__AARCH64EL__) || defined(_M_ARM64) +#define OPENSSL_64_BIT +#define OPENSSL_AARCH64 +#elif defined(__ARMEL__) || defined(_M_ARM) +#define OPENSSL_32_BIT +#define OPENSSL_ARM +// All of the following architectures are only supported when `__BYTE_ORDER__` can be used to detect +// endianness (in crypto/internal.h). +#elif !defined(__BYTE_ORDER__) +#error "Cannot determine endianness because __BYTE_ORDER__ is not defined" +// Targets are assumed to be little-endian unless __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__. 
+#elif !(defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) && \ + !(defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)) +#error "Unsupported endianness" +#elif defined(__LP64__) +#define OPENSSL_64_BIT +#elif defined(__ILP32__) +#define OPENSSL_32_BIT +// Versions of GCC before 10.0 didn't define `__ILP32__` for all 32-bit targets. +#elif defined(__MIPSEL__) || defined(__MIPSEB__) || defined(__PPC__) || defined(__powerpc__) || defined(__csky__) || defined(__XTENSA__) +#define OPENSSL_32_BIT +#else +#error "Unknown target CPU" +#endif + +#if defined(__APPLE__) +#define OPENSSL_APPLE +#endif + +#if defined(_WIN32) +#define OPENSSL_WINDOWS +#endif + +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define OPENSSL_ASAN +#endif +#if __has_feature(thread_sanitizer) +#define OPENSSL_TSAN +#endif +#if __has_feature(memory_sanitizer) +#define OPENSSL_MSAN +#define OPENSSL_ASM_INCOMPATIBLE +#endif +#if __has_feature(hwaddress_sanitizer) +#define OPENSSL_HWASAN +#endif +#endif + +// Disable 32-bit Arm assembly on Apple platforms. The last iOS version that +// supported 32-bit Arm was iOS 10. +#if defined(OPENSSL_APPLE) && defined(OPENSSL_ARM) +#define OPENSSL_ASM_INCOMPATIBLE +#endif + +#if defined(OPENSSL_ASM_INCOMPATIBLE) +#undef OPENSSL_ASM_INCOMPATIBLE +#if !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif // OPENSSL_ASM_INCOMPATIBLE + +#if !defined(OPENSSL_X86_64) && !defined(OPENSSL_AARCH64) +#define OPENSSL_SMALL +#endif + +#endif // OPENSSL_HEADER_TARGET_H diff --git a/ring-0.17.14/include/ring-core/type_check.h b/ring-0.17.14/include/ring-core/type_check.h new file mode 100644 index 0000000000..532fc8e8ff --- /dev/null +++ b/ring-0.17.14/include/ring-core/type_check.h @@ -0,0 +1,32 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_TYPE_CHECK_H +#define OPENSSL_HEADER_TYPE_CHECK_H + +#include + + +#if defined(__cplusplus) || (defined(_MSC_VER) && !defined(__clang__)) +// In C++ and non-clang MSVC, |static_assert| is a keyword. +#define OPENSSL_STATIC_ASSERT(cond, msg) static_assert(cond, msg) +#else +// C11 defines the |_Static_assert| keyword and the |static_assert| macro in +// assert.h. While the former is available at all versions in Clang and GCC, the +// latter depends on libc and, in glibc, depends on being built in C11 mode. We +// do not require this, for now, so use |_Static_assert| directly. +#define OPENSSL_STATIC_ASSERT(cond, msg) _Static_assert(cond, msg) +#endif + +#endif // OPENSSL_HEADER_TYPE_CHECK_H diff --git a/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-elf.S b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-elf.S new file mode 100644 index 0000000000..597264bcca --- /dev/null +++ b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-elf.S @@ -0,0 +1,1172 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 16 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.align 32 + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.Linc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl gcm_init_vpclmulqdq_avx2 +.hidden gcm_init_vpclmulqdq_avx2 +.type gcm_init_vpclmulqdq_avx2,@function +.align 32 +gcm_init_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 .Lgfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 + vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + +.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + +.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + +.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_vpclmulqdq_avx2, . 
- gcm_init_vpclmulqdq_avx2 +.globl gcm_ghash_vpclmulqdq_avx2_1 +.hidden gcm_ghash_vpclmulqdq_avx2_1 +.type gcm_ghash_vpclmulqdq_avx2_1,@function +.align 32 +gcm_ghash_vpclmulqdq_avx2_1: +.cfi_startproc + +_CET_ENDBR + + + + + + + vmovdqu .Lbswap_mask(%rip),%xmm6 + vmovdqu .Lgfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + +.Lghash_lastblock: + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +.Lghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_ghash_vpclmulqdq_avx2_1, . - gcm_ghash_vpclmulqdq_avx2_1 +.globl aes_gcm_enc_update_vaes_avx2 +.hidden aes_gcm_enc_update_vaes_avx2 +.type aes_gcm_enc_update_vaes_avx2,@function +.align 32 +aes_gcm_enc_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+8(%rip) +#endif + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe .Lghash_last_ciphertext_4x__func1 +.align 16 +.Lcrypt_loop_4x__func1: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl 
.Laes128__func1 + je .Laes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -192(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.Laes192__func1: + vbroadcasti128 -176(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -160(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.Laes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 -144(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 -128(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 
0xc4,0x62,0x05,0xdc,0xfa + + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func1 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 +.byte 
0xc4,0xe3,0x1d,0x44,0xea,0x00 +.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func1 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func1 + je .Lxor_two_blocks__func1 + +.Lxor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func1: +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func1: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_enc_update_vaes_avx2, . 
- aes_gcm_enc_update_vaes_avx2 +.globl aes_gcm_dec_update_vaes_avx2 +.hidden aes_gcm_dec_update_vaes_avx2 +.type aes_gcm_dec_update_vaes_avx2,@function +.align 32 +aes_gcm_dec_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.align 16 +.Lcrypt_loop_4x__func2: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -192(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.Laes192__func2: + vbroadcasti128 -176(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -160(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.Laes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 -144(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 -128(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 
0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func2 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 +.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 +.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq 
$64,%rdx + jz .Lreduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func2 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func2 + je .Lxor_two_blocks__func2 + +.Lxor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func2: +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func2: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_dec_update_vaes_avx2, . - aes_gcm_dec_update_vaes_avx2 +#endif diff --git a/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-macosx.S b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-macosx.S new file mode 100644 index 0000000000..9cb6127936 --- /dev/null +++ b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-macosx.S @@ -0,0 +1,1167 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const +.p2align 4 + + +L$bswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +L$gfpoly: +.quad 1, 0xc200000000000000 + + +L$gfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.p2align 5 + +L$ctr_pattern: +.quad 0, 0 +.quad 1, 0 +L$inc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl _gcm_init_vpclmulqdq_avx2 +.private_extern _gcm_init_vpclmulqdq_avx2 + +.p2align 5 +_gcm_init_vpclmulqdq_avx2: + + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 L$gfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 + vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + +.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + +.byte 0xc4,0xe3,0x5d,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x5d,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x5d,0x44,0xdd,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + +.byte 0xc4,0xe3,0x65,0x44,0xc5,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcd,0x01 +.byte 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 +.byte 0xc4,0xe3,0x65,0x44,0xe5,0x11 +.byte 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + + + +.globl _gcm_ghash_vpclmulqdq_avx2_1 +.private_extern _gcm_ghash_vpclmulqdq_avx2_1 + +.p2align 5 +_gcm_ghash_vpclmulqdq_avx2_1: + + +_CET_ENDBR + + + + + + + vmovdqu L$bswap_mask(%rip),%xmm6 + vmovdqu L$gfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + +L$ghash_lastblock: + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + 
vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +L$ghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + + + +.globl _aes_gcm_enc_update_vaes_avx2 +.private_extern _aes_gcm_enc_update_vaes_avx2 + +.p2align 5 +_aes_gcm_enc_update_vaes_avx2: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+8(%rip) +#endif + vbroadcasti128 L$bswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe L$crypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe L$ghash_last_ciphertext_4x__func1 +.p2align 4 +L$crypt_loop_4x__func1: + + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl L$aes128__func1 + je L$aes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -192(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +L$aes192__func1: + vbroadcasti128 -176(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -160(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +L$aes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 
0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 -144(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 -128(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja L$crypt_loop_4x__func1 +L$ghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + 
vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +L$crypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz L$done__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb L$lessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_1__func1 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 +.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 +.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz L$reduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +L$lessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_2__func1 
+.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + + + cmpq $32,%rdx + jb L$xor_one_block__func1 + je L$xor_two_blocks__func1 + +L$xor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp L$ghash_mul_one_vec_unreduced__func1 + +L$xor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp L$ghash_mul_one_vec_unreduced__func1 + +L$xor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +L$ghash_mul_one_vec_unreduced__func1: +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor %ymm4,%ymm7,%ymm7 + +L$reduce__func1: + + vbroadcasti128 L$gfpoly(%rip),%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +L$done__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 + + ret + + + +.globl _aes_gcm_dec_update_vaes_avx2 +.private_extern _aes_gcm_dec_update_vaes_avx2 + +.p2align 5 +_aes_gcm_dec_update_vaes_avx2: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 + vbroadcasti128 L$bswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe L$crypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.p2align 4 +L$crypt_loop_4x__func2: + + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl L$aes128__func2 + je L$aes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -192(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +L$aes192__func2: + vbroadcasti128 -176(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 
0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 -160(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +L$aes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 +.byte 0xc4,0xe3,0x65,0x44,0xec,0x00 +.byte 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 -144(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 -128(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu 96(%r9),%ymm4 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor %ymm2,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 +.byte 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 +.byte 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + +.byte 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea +.byte 0xc4,0x62,0x0d,0xdc,0xf2 +.byte 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 +.byte 
0xc4,0x62,0x1d,0xdd,0xe2 +.byte 0xc4,0x62,0x15,0xdd,0xeb +.byte 0xc4,0x62,0x0d,0xdd,0xf5 +.byte 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja L$crypt_loop_4x__func2 +L$crypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz L$done__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb L$lessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_1__func2 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 +.byte 0xc4,0xe3,0x1d,0x44,0xea,0x00 +.byte 0xc4,0xe3,0x1d,0x44,0xf2,0x01 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xfa,0x11 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor %ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz L$reduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +L$lessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 +.byte 0xc4,0x62,0x1d,0xdc,0xe2 +.byte 0xc4,0x62,0x15,0xdc,0xea + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_2__func2 +.byte 0xc4,0x42,0x1d,0xdd,0xe2 +.byte 0xc4,0x42,0x15,0xdd,0xea + + + + + cmpq $32,%rdx + jb L$xor_one_block__func2 + je L$xor_two_blocks__func2 + +L$xor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp L$ghash_mul_one_vec_unreduced__func2 + +L$xor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp L$ghash_mul_one_vec_unreduced__func2 + +L$xor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +L$ghash_mul_one_vec_unreduced__func2: +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor 
%ymm4,%ymm5,%ymm5 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor %ymm4,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor %ymm4,%ymm7,%ymm7 + +L$reduce__func2: + + vbroadcasti128 L$gfpoly(%rip),%ymm2 +.byte 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 +.byte 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +L$done__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 + + ret + + + +#endif diff --git a/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.asm b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.asm new file mode 100644 index 0000000000..f07e5a3d6a --- /dev/null +++ b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.asm @@ -0,0 +1,1423 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .rdata rdata align=8 +ALIGN 16 + + +$L$bswap_mask: + DQ 0x08090a0b0c0d0e0f,0x0001020304050607 + + + + + + + + +$L$gfpoly: + DQ 1,0xc200000000000000 + + +$L$gfpoly_and_internal_carrybit: + DQ 1,0xc200000000000001 + +ALIGN 32 + +$L$ctr_pattern: + DQ 0,0 + DQ 1,0 +$L$inc_2blocks: + DQ 2,0 + DQ 2,0 + +section .text code align=64 + +global gcm_init_vpclmulqdq_avx2 + +ALIGN 32 +gcm_init_vpclmulqdq_avx2: + +$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1: +_CET_ENDBR + sub rsp,24 +$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3: + +$L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4: + + + + vpshufd xmm3,XMMWORD[rdx],0x4e + + + + + + vpshufd xmm0,xmm3,0xd3 + vpsrad xmm0,xmm0,31 + vpaddq xmm3,xmm3,xmm3 + vpand xmm0,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit] + vpxor xmm3,xmm3,xmm0 + + vbroadcasti128 ymm6,XMMWORD[$L$gfpoly] + + + vpclmulqdq xmm0,xmm3,xmm3,0x00 + vpclmulqdq xmm1,xmm3,xmm3,0x01 + vpclmulqdq xmm2,xmm3,xmm3,0x10 + vpxor xmm1,xmm1,xmm2 + vpclmulqdq xmm2,xmm6,xmm0,0x01 + vpshufd xmm0,xmm0,0x4e + vpxor xmm1,xmm1,xmm0 + vpxor xmm1,xmm1,xmm2 + vpclmulqdq xmm5,xmm3,xmm3,0x11 + vpclmulqdq xmm0,xmm6,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpxor xmm5,xmm5,xmm1 + vpxor xmm5,xmm5,xmm0 + + + + vinserti128 ymm3,ymm5,xmm3,1 + vinserti128 ymm5,ymm5,xmm5,1 + + + DB 0xc4,0xe3,0x65,0x44,0xc5,0x00 + DB 0xc4,0xe3,0x65,0x44,0xcd,0x01 + DB 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xe5,0x11 + DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm4,ymm4,ymm1 + vpxor ymm4,ymm4,ymm0 + + + + vmovdqu YMMWORD[96+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + + + + vpunpcklqdq ymm0,ymm4,ymm3 + vpunpckhqdq ymm1,ymm4,ymm3 + vpxor ymm0,ymm0,ymm1 + vmovdqu YMMWORD[(128+32)+rcx],ymm0 + + + DB 0xc4,0xe3,0x5d,0x44,0xc5,0x00 + DB 0xc4,0xe3,0x5d,0x44,0xcd,0x01 + DB 0xc4,0xe3,0x5d,0x44,0xd5,0x10 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x5d,0x44,0xdd,0x11 + DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm3,ymm3,ymm1 + vpxor ymm3,ymm3,ymm0 + + DB 0xc4,0xe3,0x65,0x44,0xc5,0x00 + 
DB 0xc4,0xe3,0x65,0x44,0xcd,0x01 + DB 0xc4,0xe3,0x65,0x44,0xd5,0x10 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x4d,0x44,0xd0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xe5,0x11 + DB 0xc4,0xe3,0x4d,0x44,0xc1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm4,ymm4,ymm1 + vpxor ymm4,ymm4,ymm0 + + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[rcx],ymm4 + + + + vpunpcklqdq ymm0,ymm4,ymm3 + vpunpckhqdq ymm1,ymm4,ymm3 + vpxor ymm0,ymm0,ymm1 + vmovdqu YMMWORD[128+rcx],ymm0 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + add rsp,24 + ret +$L$SEH_end_gcm_init_vpclmulqdq_avx2_5: + + +global gcm_ghash_vpclmulqdq_avx2_1 + +ALIGN 32 +gcm_ghash_vpclmulqdq_avx2_1: + +$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1: +_CET_ENDBR + sub rsp,72 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_3: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_4: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_5: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_6: + +$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_1_7: + + + + + vmovdqu xmm6,XMMWORD[$L$bswap_mask] + vmovdqu xmm7,XMMWORD[$L$gfpoly] + + + vmovdqu xmm5,XMMWORD[rcx] + vpshufb xmm5,xmm5,xmm6 + + + +$L$ghash_lastblock: + vmovdqu xmm0,XMMWORD[r8] + vpshufb xmm0,xmm0,xmm6 + vpxor xmm5,xmm5,xmm0 + vmovdqu xmm0,XMMWORD[((128-16))+rdx] + vpclmulqdq xmm1,xmm5,xmm0,0x00 + vpclmulqdq xmm2,xmm5,xmm0,0x01 + vpclmulqdq xmm3,xmm5,xmm0,0x10 + vpxor xmm2,xmm2,xmm3 + vpclmulqdq xmm3,xmm7,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpxor xmm2,xmm2,xmm1 + vpxor xmm2,xmm2,xmm3 + vpclmulqdq xmm5,xmm5,xmm0,0x11 + vpclmulqdq xmm1,xmm7,xmm2,0x01 + vpshufd xmm2,xmm2,0x4e + vpxor xmm5,xmm5,xmm2 + vpxor xmm5,xmm5,xmm1 + + +$L$ghash_done: + + vpshufb xmm5,xmm5,xmm6 + vmovdqu XMMWORD[rcx],xmm5 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + add rsp,72 + ret +$L$SEH_end_gcm_ghash_vpclmulqdq_avx2_1_8: + + +global aes_gcm_enc_update_vaes_avx2 + +ALIGN 32 +aes_gcm_enc_update_vaes_avx2: + +$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2: + push rdi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3: + push r12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9: + movdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10: + movdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11: + movdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12: + movdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13: + movdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14: + movdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15: + +$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16: +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+8))],1 +%endif + vbroadcasti128 
ymm0,XMMWORD[$L$bswap_mask] + + + + vmovdqu xmm1,XMMWORD[r12] + vpshufb xmm1,xmm1,xmm0 + vbroadcasti128 ymm11,XMMWORD[rsi] + vpshufb ymm11,ymm11,ymm0 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti128 ymm9,XMMWORD[r9] + vbroadcasti128 ymm10,XMMWORD[r11] + + + vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] + + + + cmp r8,127 + jbe NEAR $L$crypt_loop_4x_done__func1 + + vmovdqu ymm7,YMMWORD[128+rdi] + vmovdqu ymm8,YMMWORD[((128+32))+rdi] + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + lea rax,[16+r9] +$L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_first_4_vecs__func1 + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + DB 0xc4,0x62,0x1d,0xdd,0xe2 + DB 0xc4,0x62,0x15,0xdd,0xeb + DB 0xc4,0x62,0x0d,0xdd,0xf5 + DB 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + add r8,-128 + cmp r8,127 + jbe NEAR $L$ghash_last_ciphertext_4x__func1 +ALIGN 16 +$L$crypt_loop_4x__func1: + + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + cmp r10d,24 + jl NEAR $L$aes128__func1 + je NEAR $L$aes192__func1 + + vbroadcasti128 ymm2,XMMWORD[((-208))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 ymm2,XMMWORD[((-192))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + +$L$aes192__func1: + vbroadcasti128 ymm2,XMMWORD[((-176))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 ymm2,XMMWORD[((-160))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + +$L$aes128__func1: + prefetcht0 [512+rcx] + prefetcht0 [((512+64))+rcx] + + vmovdqu ymm3,YMMWORD[rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + DB 0xc4,0xe3,0x65,0x44,0xec,0x00 + DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 ymm2,XMMWORD[((-144))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 ymm2,XMMWORD[((-128))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu ymm3,YMMWORD[32+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + DB 
0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-112))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu ymm3,YMMWORD[64+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + + vbroadcasti128 ymm2,XMMWORD[((-96))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-80))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rdx] + vpshufb ymm3,ymm3,ymm0 + + vbroadcasti128 ymm2,XMMWORD[((-64))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu ymm4,YMMWORD[96+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-48))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-32))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + DB 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-16))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + + sub rdx,-128 + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + DB 0xc4,0x62,0x1d,0xdd,0xe2 + DB 0xc4,0x62,0x15,0xdd,0xeb + DB 0xc4,0x62,0x0d,0xdd,0xf5 + DB 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + + add r8,-128 + cmp r8,127 + ja NEAR $L$crypt_loop_4x__func1 +$L$ghash_last_ciphertext_4x__func1: + + vmovdqu ymm3,YMMWORD[rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + DB 0xc4,0xe3,0x65,0x44,0xec,0x00 + DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vmovdqu ymm3,YMMWORD[32+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor ymm6,ymm6,ymm2 + + vmovdqu ymm3,YMMWORD[64+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu 
ymm4,YMMWORD[64+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[96+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor ymm6,ymm6,ymm2 + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + DB 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + sub rdx,-128 +$L$crypt_loop_4x_done__func1: + + test r8,r8 + jz NEAR $L$done__func1 + + + + + + lea rsi,[128+rdi] + sub rsi,r8 + + + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + vpxor xmm7,xmm7,xmm7 + + cmp r8,64 + jb NEAR $L$lessthan64bytes__func1 + + + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_1__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_1__func1 + DB 0xc4,0x42,0x1d,0xdd,0xe2 + DB 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu ymm3,YMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor ymm13,ymm13,ymm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + + + vpshufb ymm12,ymm12,ymm0 + vpshufb ymm13,ymm13,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu ymm3,YMMWORD[32+rsi] + DB 0xc4,0xe3,0x1d,0x44,0xea,0x00 + DB 0xc4,0xe3,0x1d,0x44,0xf2,0x01 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xfa,0x11 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor ymm5,ymm5,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor ymm7,ymm7,ymm4 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + jz NEAR $L$reduce__func1 + + vpxor xmm1,xmm1,xmm1 + + +$L$lessthan64bytes__func1: + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_2__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_2__func1 + DB 0xc4,0x42,0x1d,0xdd,0xe2 + DB 0xc4,0x42,0x15,0xdd,0xea + + + + + cmp r8,32 + jb NEAR $L$xor_one_block__func1 + je NEAR $L$xor_two_blocks__func1 + +$L$xor_three_blocks__func1: + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor xmm13,xmm13,xmm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu XMMWORD[32+rdx],xmm13 + + vpshufb ymm12,ymm12,ymm0 + vpshufb xmm13,xmm13,xmm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu xmm3,XMMWORD[32+rsi] + vpclmulqdq xmm4,xmm13,xmm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x11 + 
vpxor ymm7,ymm7,ymm4 + jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 + +$L$xor_two_blocks__func1: + vmovdqu ymm2,YMMWORD[rcx] + vpxor ymm12,ymm12,ymm2 + vmovdqu YMMWORD[rdx],ymm12 + vpshufb ymm12,ymm12,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 + +$L$xor_one_block__func1: + vmovdqu xmm2,XMMWORD[rcx] + vpxor xmm12,xmm12,xmm2 + vmovdqu XMMWORD[rdx],xmm12 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm12,xmm12,xmm1 + vmovdqu xmm2,XMMWORD[rsi] + +$L$ghash_mul_one_vec_unreduced__func1: + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor ymm5,ymm5,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor ymm7,ymm7,ymm4 + +$L$reduce__func1: + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm7,ymm7,ymm6 + vpxor ymm7,ymm7,ymm3 + vextracti128 xmm1,ymm7,1 + vpxor xmm1,xmm1,xmm7 + +$L$done__func1: + + vpshufb xmm1,xmm1,xmm0 + vmovdqu XMMWORD[r12],xmm1 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + movdqa xmm10,XMMWORD[64+rsp] + movdqa xmm11,XMMWORD[80+rsp] + movdqa xmm12,XMMWORD[96+rsp] + movdqa xmm13,XMMWORD[112+rsp] + movdqa xmm14,XMMWORD[128+rsp] + movdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_enc_update_vaes_avx2_17: + + +global aes_gcm_dec_update_vaes_avx2 + +ALIGN 32 +aes_gcm_dec_update_vaes_avx2: + +$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2: + push rdi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3: + push r12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9: + movdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10: + movdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11: + movdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12: + movdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13: + movdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14: + movdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15: + +$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16: + vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask] + + + + vmovdqu xmm1,XMMWORD[r12] + vpshufb xmm1,xmm1,xmm0 + vbroadcasti128 ymm11,XMMWORD[rsi] + vpshufb ymm11,ymm11,ymm0 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti128 ymm9,XMMWORD[r9] + vbroadcasti128 ymm10,XMMWORD[r11] + + + vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] + + + + cmp r8,127 + jbe NEAR $L$crypt_loop_4x_done__func2 + + vmovdqu ymm7,YMMWORD[128+rdi] + vmovdqu ymm8,YMMWORD[((128+32))+rdi] +ALIGN 16 +$L$crypt_loop_4x__func2: + + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + 
vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + cmp r10d,24 + jl NEAR $L$aes128__func2 + je NEAR $L$aes192__func2 + + vbroadcasti128 ymm2,XMMWORD[((-208))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 ymm2,XMMWORD[((-192))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + +$L$aes192__func2: + vbroadcasti128 ymm2,XMMWORD[((-176))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vbroadcasti128 ymm2,XMMWORD[((-160))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + +$L$aes128__func2: + prefetcht0 [512+rcx] + prefetcht0 [((512+64))+rcx] + + vmovdqu ymm3,YMMWORD[rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + DB 0xc4,0xe3,0x65,0x44,0xec,0x00 + DB 0xc4,0xe3,0x65,0x44,0xcc,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xf7,0x00 + + vbroadcasti128 ymm2,XMMWORD[((-144))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vbroadcasti128 ymm2,XMMWORD[((-128))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu ymm3,YMMWORD[32+rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xd7,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-112))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vmovdqu ymm3,YMMWORD[64+rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + + vbroadcasti128 ymm2,XMMWORD[((-96))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-80))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rcx] + vpshufb ymm3,ymm3,ymm0 + + vbroadcasti128 ymm2,XMMWORD[((-64))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vmovdqu ymm4,YMMWORD[96+rdi] + DB 0xc4,0xe3,0x65,0x44,0xd4,0x00 + vpxor ymm5,ymm5,ymm2 + DB 0xc4,0xe3,0x65,0x44,0xd4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + DB 0xc4,0xc3,0x6d,0x44,0xd0,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-48))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 
ymm4,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x5d,0x44,0xd5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-32))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + + DB 0xc4,0xe3,0x5d,0x44,0xd6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-16))+r11] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + DB 0xc4,0x62,0x0d,0xdc,0xf2 + DB 0xc4,0x62,0x05,0xdc,0xfa + + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + + + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + DB 0xc4,0x62,0x1d,0xdd,0xe2 + DB 0xc4,0x62,0x15,0xdd,0xeb + DB 0xc4,0x62,0x0d,0xdd,0xf5 + DB 0xc4,0x62,0x05,0xdd,0xfe + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + sub rdx,-128 + add r8,-128 + cmp r8,127 + ja NEAR $L$crypt_loop_4x__func2 +$L$crypt_loop_4x_done__func2: + + test r8,r8 + jz NEAR $L$done__func2 + + + + + + lea rsi,[128+rdi] + sub rsi,r8 + + + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + vpxor xmm7,xmm7,xmm7 + + cmp r8,64 + jb NEAR $L$lessthan64bytes__func2 + + + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_1__func2: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_1__func2 + DB 0xc4,0x42,0x1d,0xdd,0xe2 + DB 0xc4,0x42,0x15,0xdd,0xea + + + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu ymm3,YMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor ymm13,ymm13,ymm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + + + vpshufb ymm12,ymm2,ymm0 + vpshufb ymm13,ymm3,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu ymm3,YMMWORD[32+rsi] + DB 0xc4,0xe3,0x1d,0x44,0xea,0x00 + DB 0xc4,0xe3,0x1d,0x44,0xf2,0x01 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xfa,0x11 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x00 + vpxor ymm5,ymm5,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x01 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x15,0x44,0xe3,0x11 + vpxor ymm7,ymm7,ymm4 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + jz NEAR $L$reduce__func2 + + vpxor xmm1,xmm1,xmm1 + + +$L$lessthan64bytes__func2: + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_2__func2: + vbroadcasti128 ymm2,XMMWORD[rax] + DB 0xc4,0x62,0x1d,0xdc,0xe2 + DB 0xc4,0x62,0x15,0xdc,0xea + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_2__func2 + DB 0xc4,0x42,0x1d,0xdd,0xe2 + DB 0xc4,0x42,0x15,0xdd,0xea + + + + + cmp r8,32 + jb NEAR $L$xor_one_block__func2 + je NEAR $L$xor_two_blocks__func2 + +$L$xor_three_blocks__func2: + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor xmm13,xmm13,xmm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu XMMWORD[32+rdx],xmm13 + + vpshufb ymm12,ymm2,ymm0 + vpshufb xmm13,xmm3,xmm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu xmm3,XMMWORD[32+rsi] + vpclmulqdq xmm4,xmm13,xmm3,0x00 + vpxor 
ymm5,ymm5,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x11 + vpxor ymm7,ymm7,ymm4 + jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 + +$L$xor_two_blocks__func2: + vmovdqu ymm2,YMMWORD[rcx] + vpxor ymm12,ymm12,ymm2 + vmovdqu YMMWORD[rdx],ymm12 + vpshufb ymm12,ymm2,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 + +$L$xor_one_block__func2: + vmovdqu xmm2,XMMWORD[rcx] + vpxor xmm12,xmm12,xmm2 + vmovdqu XMMWORD[rdx],xmm12 + vpshufb xmm12,xmm2,xmm0 + vpxor xmm12,xmm12,xmm1 + vmovdqu xmm2,XMMWORD[rsi] + +$L$ghash_mul_one_vec_unreduced__func2: + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x00 + vpxor ymm5,ymm5,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x01 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x10 + vpxor ymm6,ymm6,ymm4 + DB 0xc4,0xe3,0x1d,0x44,0xe2,0x11 + vpxor ymm7,ymm7,ymm4 + +$L$reduce__func2: + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + DB 0xc4,0xe3,0x6d,0x44,0xdd,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm3 + DB 0xc4,0xe3,0x6d,0x44,0xde,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm7,ymm7,ymm6 + vpxor ymm7,ymm7,ymm3 + vextracti128 xmm1,ymm7,1 + vpxor xmm1,xmm1,xmm7 + +$L$done__func2: + + vpshufb xmm1,xmm1,xmm0 + vmovdqu XMMWORD[r12],xmm1 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + movdqa xmm10,XMMWORD[64+rsp] + movdqa xmm11,XMMWORD[80+rsp] + movdqa xmm12,XMMWORD[96+rsp] + movdqa xmm13,XMMWORD[112+rsp] + movdqa xmm14,XMMWORD[128+rsp] + movdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_dec_update_vaes_avx2_17: + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_vpclmulqdq_avx2_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_vpclmulqdq_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx2_1_8 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx2_1_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_enc_update_vaes_avx2_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_enc_update_vaes_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_dec_update_vaes_avx2_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_dec_update_vaes_avx2_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_init_vpclmulqdq_avx2_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_vpclmulqdq_avx2_1_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_1_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 9 + DB 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 
104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_1_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1_1 + DB 130 + + DW 0 +$L$SEH_info_aes_gcm_enc_update_vaes_avx2_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 112 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 96 + + DW 0 +$L$SEH_info_aes_gcm_dec_update_vaes_avx2_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 112 + DB 
$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 96 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.o b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.o new file mode 100644 index 0000000000..89ecad4569 Binary files /dev/null and b/ring-0.17.14/pregenerated/aes-gcm-avx2-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/aesni-gcm-x86_64-elf.S b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-elf.S new file mode 100644 index 0000000000..c547df6367 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-elf.S @@ -0,0 +1,883 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: +.cfi_startproc + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + + + + + + + + + + + + + + + + + + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + 
vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%r10d + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + 
vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + + prefetcht0 512(%rdi) + prefetcht0 576(%rdi) + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%rax + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + ret +.cfi_endproc +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +.globl aesni_gcm_decrypt +.hidden aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +.align 32 +aesni_gcm_decrypt: +.cfi_startproc + +_CET_ENDBR + xorq %rax,%rax + + + + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + pushq %rbx +.cfi_offset %rbx,-24 + + pushq %r12 +.cfi_offset %r12,-32 + + pushq %r13 +.cfi_offset %r13,-40 + + pushq %r14 +.cfi_offset %r14,-48 + + pushq %r15 +.cfi_offset %r15,-56 + + vzeroupper + + movq 16(%rbp),%r12 + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r12),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32(%r9),%r9 + movl 240-128(%rcx),%r10d + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + movq %rdi,%r14 + vmovdqu 64(%rdi),%xmm4 + + + + + + + + leaq -192(%rdi,%rdx,1),%r15 + + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %rax,%rax + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + movq 
16(%rbp),%r12 + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp +.cfi_def_cfa %rsp, 0x38 + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +.Lgcm_dec_abort: + ret + +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%r10),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + ret +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.hidden aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+2(%rip) +#endif + xorq %rax,%rax + + + + + cmpq $288,%rdx + jb .Lgcm_enc_abort + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + pushq %rbx +.cfi_offset 
%rbx,-24 + + pushq %r12 +.cfi_offset %r12,-32 + + pushq %r13 +.cfi_offset %r13,-40 + + pushq %r14 +.cfi_offset %r14,-48 + + pushq %r15 +.cfi_offset %r15,-56 + + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%r10d + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp +.Lenc_no_key_aliasing: + + movq %rsi,%r14 + + + + + + + + + leaq -192(%rsi,%rdx,1),%r15 + + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + movq 16(%rbp),%r12 + leaq 32(%r9),%r9 + vmovdqu (%r12),%xmm8 + subq $12,%rdx + movq $192,%rax + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq 
$0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + movq 16(%rbp),%r12 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp +.cfi_def_cfa %rsp, 0x38 + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +.Lgcm_enc_abort: + ret + +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +.section .rodata +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/ring-0.17.14/pregenerated/aesni-gcm-x86_64-macosx.S 
b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-macosx.S new file mode 100644 index 0000000000..b9a70cfc8c --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-macosx.S @@ -0,0 +1,868 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + +.p2align 5 +_aesni_ctr32_ghash_6x: + + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp L$oop6x + +.p2align 5 +L$oop6x: + addl $100663296,%ebx + jc L$handle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +L$resume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + + + + + + + + + + + + + + + + + + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq 
$0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%r10d + jb L$enc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp L$enc_tail + +.p2align 5 +L$handle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp L$resume_ctr32 + +.p2align 5 +L$enc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc 
%xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + + prefetcht0 512(%rdi) + prefetcht0 576(%rdi) + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%rax + subq $0x6,%rdx + jc L$6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp L$oop6x + +L$6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + ret + + +.globl _aesni_gcm_decrypt +.private_extern _aesni_gcm_decrypt + +.p2align 5 +_aesni_gcm_decrypt: + + +_CET_ENDBR + xorq %rax,%rax + + + + cmpq $0x60,%rdx + jb L$gcm_dec_abort + + pushq %rbp + + + movq %rsp,%rbp + + pushq %rbx + + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + vzeroupper + + movq 16(%rbp),%r12 + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq L$bswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r12),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32(%r9),%r9 + movl 240-128(%rcx),%r10d + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc L$dec_no_key_aliasing + cmpq $768,%r15 + jnc L$dec_no_key_aliasing + subq %r15,%rsp +L$dec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + movq %rdi,%r14 + vmovdqu 64(%rdi),%xmm4 + + + + + + + + leaq -192(%rdi,%rdx,1),%r15 + + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %rax,%rax + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + movq 16(%rbp),%r12 + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + +L$gcm_dec_abort: + ret + + + + +.p2align 5 +_aesni_ctr32_6x: + + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%r10),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc L$handle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + 
vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp L$oop_ctr32 + +.p2align 4 +L$oop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz L$oop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + ret +.p2align 5 +L$handle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp L$oop_ctr32 + + + +.globl _aesni_gcm_encrypt +.private_extern _aesni_gcm_encrypt + +.p2align 5 +_aesni_gcm_encrypt: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+2(%rip) +#endif + xorq %rax,%rax + + + + + cmpq $288,%rdx + jb L$gcm_enc_abort + + pushq %rbp + + + movq %rsp,%rbp + + pushq %rbx + + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq L$bswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%r10d + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc L$enc_no_key_aliasing + cmpq $768,%r15 + jnc L$enc_no_key_aliasing + subq %r15,%rsp +L$enc_no_key_aliasing: + + movq %rsi,%r14 + + + + + + + + + leaq -192(%rsi,%rdx,1),%r15 + + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + movq 16(%rbp),%r12 + leaq 32(%r9),%r9 + vmovdqu (%r12),%xmm8 + subq $12,%rdx + movq $192,%rax + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups 
%xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq 
$0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + movq 16(%rbp),%r12 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + +L$gcm_enc_abort: + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$poly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$one_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$two_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +L$one_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.asm b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.asm new file mode 100644 index 0000000000..f1a7ca8bd1 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.asm @@ -0,0 +1,1104 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + + +ALIGN 32 +_aesni_ctr32_ghash_6x: + + vmovdqu xmm2,XMMWORD[32+r11] + sub r8,6 + vpxor xmm4,xmm4,xmm4 + vmovdqu xmm15,XMMWORD[((0-128))+r9] + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpaddb xmm12,xmm11,xmm2 + vpaddb xmm13,xmm12,xmm2 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm9,xmm1,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm4 + jmp NEAR $L$oop6x + +ALIGN 32 +$L$oop6x: + add ebx,100663296 + jc NEAR $L$handle_ctr32 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpaddb xmm1,xmm14,xmm2 + vpxor xmm10,xmm10,xmm15 + vpxor xmm11,xmm11,xmm15 + +$L$resume_ctr32: + vmovdqu XMMWORD[rdi],xmm1 + vpclmulqdq xmm5,xmm7,xmm3,0x10 + vpxor xmm12,xmm12,xmm15 + vmovups xmm2,XMMWORD[((16-128))+r9] + vpclmulqdq xmm6,xmm7,xmm3,0x01 + + + + + + + + + + + + + + + + + + xor r12,r12 + cmp r15,r14 + + vaesenc xmm9,xmm9,xmm2 + vmovdqu xmm0,XMMWORD[((48+8))+rsp] + vpxor xmm13,xmm13,xmm15 + vpclmulqdq xmm1,xmm7,xmm3,0x00 + vaesenc xmm10,xmm10,xmm2 + vpxor xmm14,xmm14,xmm15 + setnc r12b + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vaesenc xmm11,xmm11,xmm2 + vmovdqu xmm3,XMMWORD[((16-32))+rsi] + neg r12 + vaesenc xmm12,xmm12,xmm2 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm3,0x00 + vpxor xmm8,xmm8,xmm4 + vaesenc xmm13,xmm13,xmm2 + vpxor xmm4,xmm1,xmm5 + and r12,0x60 + vmovups xmm15,XMMWORD[((32-128))+r9] + vpclmulqdq xmm1,xmm0,xmm3,0x10 + vaesenc xmm14,xmm14,xmm2 + + vpclmulqdq xmm2,xmm0,xmm3,0x01 + lea r14,[r12*1+r14] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpclmulqdq xmm3,xmm0,xmm3,0x11 + vmovdqu xmm0,XMMWORD[((64+8))+rsp] + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[88+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[80+r14] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((32+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((40+8))+rsp],r12 + vmovdqu xmm5,XMMWORD[((48-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((48-128))+r9] + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm0,xmm5,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm5,0x10 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm7,xmm7,xmm3 + vpclmulqdq xmm3,xmm0,xmm5,0x01 + vaesenc xmm11,xmm11,xmm15 + vpclmulqdq xmm5,xmm0,xmm5,0x11 + vmovdqu xmm0,XMMWORD[((80+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vpxor xmm4,xmm4,xmm1 + vmovdqu xmm1,XMMWORD[((64-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((64-128))+r9] + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm1,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm1,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[72+r14] + vpxor xmm7,xmm7,xmm5 + vpclmulqdq xmm5,xmm0,xmm1,0x01 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[64+r14] + vpclmulqdq xmm1,xmm0,xmm1,0x11 + vmovdqu xmm0,XMMWORD[((96+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((48+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((56+8))+rsp],r12 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm2,XMMWORD[((96-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((80-128))+r9] + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm2,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm2,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[56+r14] + vpxor xmm7,xmm7,xmm1 + vpclmulqdq xmm1,xmm0,xmm2,0x01 + vpxor xmm8,xmm8,XMMWORD[((112+8))+rsp] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[48+r14] + vpclmulqdq 
xmm2,xmm0,xmm2,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((64+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((72+8))+rsp],r12 + vpxor xmm4,xmm4,xmm3 + vmovdqu xmm3,XMMWORD[((112-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((96-128))+r9] + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm8,xmm3,0x10 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm8,xmm3,0x01 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[40+r14] + vpxor xmm7,xmm7,xmm2 + vpclmulqdq xmm2,xmm8,xmm3,0x00 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[32+r14] + vpclmulqdq xmm8,xmm8,xmm3,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((80+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((88+8))+rsp],r12 + vpxor xmm6,xmm6,xmm5 + vaesenc xmm14,xmm14,xmm15 + vpxor xmm6,xmm6,xmm1 + + vmovups xmm15,XMMWORD[((112-128))+r9] + vpslldq xmm5,xmm6,8 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm3,XMMWORD[16+r11] + + vaesenc xmm9,xmm9,xmm15 + vpxor xmm7,xmm7,xmm8 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm4,xmm4,xmm5 + movbe r13,QWORD[24+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[16+r14] + vpalignr xmm0,xmm4,xmm4,8 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + mov QWORD[((96+8))+rsp],r13 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((104+8))+rsp],r12 + vaesenc xmm13,xmm13,xmm15 + vmovups xmm1,XMMWORD[((128-128))+r9] + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vmovups xmm15,XMMWORD[((144-128))+r9] + vaesenc xmm10,xmm10,xmm1 + vpsrldq xmm6,xmm6,8 + vaesenc xmm11,xmm11,xmm1 + vpxor xmm7,xmm7,xmm6 + vaesenc xmm12,xmm12,xmm1 + vpxor xmm4,xmm4,xmm0 + movbe r13,QWORD[8+r14] + vaesenc xmm13,xmm13,xmm1 + movbe r12,QWORD[r14] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((160-128))+r9] + cmp r10d,11 + jb NEAR $L$enc_tail + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((176-128))+r9] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((192-128))+r9] + + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((208-128))+r9] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((224-128))+r9] + jmp NEAR $L$enc_tail + +ALIGN 32 +$L$handle_ctr32: + vmovdqu xmm0,XMMWORD[r11] + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm15 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm15 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpshufb xmm14,xmm14,xmm0 + vpshufb xmm1,xmm1,xmm0 + jmp NEAR $L$resume_ctr32 + +ALIGN 32 +$L$enc_tail: + vaesenc xmm9,xmm9,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm7 + vpalignr xmm8,xmm4,xmm4,8 + vaesenc xmm10,xmm10,xmm15 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + vpxor xmm2,xmm1,XMMWORD[rcx] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm0,xmm1,XMMWORD[16+rcx] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm5,xmm1,XMMWORD[32+rcx] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm6,xmm1,XMMWORD[48+rcx] + vaesenc xmm14,xmm14,xmm15 + 
vpxor xmm7,xmm1,XMMWORD[64+rcx] + vpxor xmm3,xmm1,XMMWORD[80+rcx] + vmovdqu xmm1,XMMWORD[rdi] + + vaesenclast xmm9,xmm9,xmm2 + vmovdqu xmm2,XMMWORD[32+r11] + vaesenclast xmm10,xmm10,xmm0 + vpaddb xmm0,xmm1,xmm2 + mov QWORD[((112+8))+rsp],r13 + lea rcx,[96+rcx] + + prefetcht0 [512+rcx] + prefetcht0 [576+rcx] + vaesenclast xmm11,xmm11,xmm5 + vpaddb xmm5,xmm0,xmm2 + mov QWORD[((120+8))+rsp],r12 + lea rdx,[96+rdx] + vmovdqu xmm15,XMMWORD[((0-128))+r9] + vaesenclast xmm12,xmm12,xmm6 + vpaddb xmm6,xmm5,xmm2 + vaesenclast xmm13,xmm13,xmm7 + vpaddb xmm7,xmm6,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vpaddb xmm3,xmm7,xmm2 + + add rax,0x60 + sub r8,0x6 + jc NEAR $L$6x_done + + vmovups XMMWORD[(-96)+rdx],xmm9 + vpxor xmm9,xmm1,xmm15 + vmovups XMMWORD[(-80)+rdx],xmm10 + vmovdqa xmm10,xmm0 + vmovups XMMWORD[(-64)+rdx],xmm11 + vmovdqa xmm11,xmm5 + vmovups XMMWORD[(-48)+rdx],xmm12 + vmovdqa xmm12,xmm6 + vmovups XMMWORD[(-32)+rdx],xmm13 + vmovdqa xmm13,xmm7 + vmovups XMMWORD[(-16)+rdx],xmm14 + vmovdqa xmm14,xmm3 + vmovdqu xmm7,XMMWORD[((32+8))+rsp] + jmp NEAR $L$oop6x + +$L$6x_done: + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpxor xmm8,xmm8,xmm4 + + ret + + +global aesni_gcm_decrypt + +ALIGN 32 +aesni_gcm_decrypt: + +$L$SEH_begin_aesni_gcm_decrypt_1: +_CET_ENDBR + xor rax,rax + + + + cmp r8,0x60 + jb NEAR $L$gcm_dec_abort + + push rbp + +$L$SEH_prologue_aesni_gcm_decrypt_2: + mov rbp,rsp + + push rbx + +$L$SEH_prologue_aesni_gcm_decrypt_3: + push r12 + +$L$SEH_prologue_aesni_gcm_decrypt_4: + push r13 + +$L$SEH_prologue_aesni_gcm_decrypt_5: + push r14 + +$L$SEH_prologue_aesni_gcm_decrypt_6: + push r15 + +$L$SEH_prologue_aesni_gcm_decrypt_7: + lea rsp,[((-168))+rsp] +$L$SEH_prologue_aesni_gcm_decrypt_8: +$L$SEH_prologue_aesni_gcm_decrypt_9: + + + + mov QWORD[16+rbp],rdi +$L$SEH_prologue_aesni_gcm_decrypt_10: + mov QWORD[24+rbp],rsi +$L$SEH_prologue_aesni_gcm_decrypt_11: + mov rdi,QWORD[48+rbp] + mov rsi,QWORD[56+rbp] + + movaps XMMWORD[(-208)+rbp],xmm6 +$L$SEH_prologue_aesni_gcm_decrypt_12: + movaps XMMWORD[(-192)+rbp],xmm7 +$L$SEH_prologue_aesni_gcm_decrypt_13: + movaps XMMWORD[(-176)+rbp],xmm8 +$L$SEH_prologue_aesni_gcm_decrypt_14: + movaps XMMWORD[(-160)+rbp],xmm9 +$L$SEH_prologue_aesni_gcm_decrypt_15: + movaps XMMWORD[(-144)+rbp],xmm10 +$L$SEH_prologue_aesni_gcm_decrypt_16: + movaps XMMWORD[(-128)+rbp],xmm11 +$L$SEH_prologue_aesni_gcm_decrypt_17: + movaps XMMWORD[(-112)+rbp],xmm12 +$L$SEH_prologue_aesni_gcm_decrypt_18: + movaps XMMWORD[(-96)+rbp],xmm13 +$L$SEH_prologue_aesni_gcm_decrypt_19: + movaps XMMWORD[(-80)+rbp],xmm14 +$L$SEH_prologue_aesni_gcm_decrypt_20: + movaps XMMWORD[(-64)+rbp],xmm15 +$L$SEH_prologue_aesni_gcm_decrypt_21: +$L$SEH_endprologue_aesni_gcm_decrypt_22: + vzeroupper + + mov r12,QWORD[64+rbp] + vmovdqu xmm1,XMMWORD[rdi] + add rsp,-128 + mov ebx,DWORD[12+rdi] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+r9] + mov r15,0xf80 + vmovdqu xmm8,XMMWORD[r12] + and rsp,-128 + vmovdqu xmm0,XMMWORD[r11] + lea r9,[128+r9] + lea rsi,[32+rsi] + mov r10d,DWORD[((240-128))+r9] + vpshufb xmm8,xmm8,xmm0 + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$dec_no_key_aliasing + cmp r15,768 + jnc NEAR $L$dec_no_key_aliasing + sub rsp,r15 +$L$dec_no_key_aliasing: + + vmovdqu xmm7,XMMWORD[80+rcx] + mov r14,rcx + vmovdqu xmm4,XMMWORD[64+rcx] + + + + + + + + lea r15,[((-192))+r8*1+rcx] + + vmovdqu xmm5,XMMWORD[48+rcx] + shr r8,4 + xor rax,rax + vmovdqu xmm6,XMMWORD[32+rcx] + vpshufb xmm7,xmm7,xmm0 + vmovdqu xmm2,XMMWORD[16+rcx] + vpshufb xmm4,xmm4,xmm0 + vmovdqu xmm3,XMMWORD[rcx] + vpshufb xmm5,xmm5,xmm0 + 
vmovdqu XMMWORD[48+rsp],xmm4 + vpshufb xmm6,xmm6,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm2,xmm2,xmm0 + vmovdqu XMMWORD[80+rsp],xmm6 + vpshufb xmm3,xmm3,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vmovdqu XMMWORD[112+rsp],xmm3 + + call _aesni_ctr32_ghash_6x + + mov r12,QWORD[64+rbp] + vmovups XMMWORD[(-96)+rdx],xmm9 + vmovups XMMWORD[(-80)+rdx],xmm10 + vmovups XMMWORD[(-64)+rdx],xmm11 + vmovups XMMWORD[(-48)+rdx],xmm12 + vmovups XMMWORD[(-32)+rdx],xmm13 + vmovups XMMWORD[(-16)+rdx],xmm14 + + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[r12],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-208))+rbp] + movaps xmm7,XMMWORD[((-192))+rbp] + movaps xmm8,XMMWORD[((-176))+rbp] + movaps xmm9,XMMWORD[((-160))+rbp] + movaps xmm10,XMMWORD[((-144))+rbp] + movaps xmm11,XMMWORD[((-128))+rbp] + movaps xmm12,XMMWORD[((-112))+rbp] + movaps xmm13,XMMWORD[((-96))+rbp] + movaps xmm14,XMMWORD[((-80))+rbp] + movaps xmm15,XMMWORD[((-64))+rbp] + mov rdi,QWORD[16+rbp] + mov rsi,QWORD[24+rbp] + lea rsp,[((-40))+rbp] + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + +$L$gcm_dec_abort: + ret +$L$SEH_end_aesni_gcm_decrypt_23: + + + +ALIGN 32 +_aesni_ctr32_6x: + + vmovdqu xmm4,XMMWORD[((0-128))+r9] + vmovdqu xmm2,XMMWORD[32+r11] + lea r13,[((-1))+r10] + vmovups xmm15,XMMWORD[((16-128))+r9] + lea r12,[((32-128))+r9] + vpxor xmm9,xmm1,xmm4 + add ebx,100663296 + jc NEAR $L$handle_ctr32_2 + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpxor xmm10,xmm10,xmm4 + vpaddb xmm12,xmm11,xmm2 + vpxor xmm11,xmm11,xmm4 + vpaddb xmm13,xmm12,xmm2 + vpxor xmm12,xmm12,xmm4 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm13,xmm13,xmm4 + vpaddb xmm1,xmm14,xmm2 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + +ALIGN 16 +$L$oop_ctr32: + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + vmovups xmm15,XMMWORD[r12] + lea r12,[16+r12] + dec r13d + jnz NEAR $L$oop_ctr32 + + vmovdqu xmm3,XMMWORD[r12] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm4,xmm3,XMMWORD[rcx] + vaesenc xmm10,xmm10,xmm15 + vpxor xmm5,xmm3,XMMWORD[16+rcx] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm6,xmm3,XMMWORD[32+rcx] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm8,xmm3,XMMWORD[48+rcx] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm2,xmm3,XMMWORD[64+rcx] + vaesenc xmm14,xmm14,xmm15 + vpxor xmm3,xmm3,XMMWORD[80+rcx] + lea rcx,[96+rcx] + + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm5 + vaesenclast xmm11,xmm11,xmm6 + vaesenclast xmm12,xmm12,xmm8 + vaesenclast xmm13,xmm13,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vmovups XMMWORD[rdx],xmm9 + vmovups XMMWORD[16+rdx],xmm10 + vmovups XMMWORD[32+rdx],xmm11 + vmovups XMMWORD[48+rdx],xmm12 + vmovups XMMWORD[64+rdx],xmm13 + vmovups XMMWORD[80+rdx],xmm14 + lea rdx,[96+rdx] + + ret +ALIGN 32 +$L$handle_ctr32_2: + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm4 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm4 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpxor xmm12,xmm12,xmm4 + vpshufb xmm14,xmm14,xmm0 + vpxor xmm13,xmm13,xmm4 + vpshufb xmm1,xmm1,xmm0 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + + + +global aesni_gcm_encrypt + +ALIGN 32 +aesni_gcm_encrypt: + +$L$SEH_begin_aesni_gcm_encrypt_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov 
BYTE[((BORINGSSL_function_hit+2))],1 +%endif + xor rax,rax + + + + + cmp r8,0x60*3 + jb NEAR $L$gcm_enc_abort + + push rbp + +$L$SEH_prologue_aesni_gcm_encrypt_2: + mov rbp,rsp + + push rbx + +$L$SEH_prologue_aesni_gcm_encrypt_3: + push r12 + +$L$SEH_prologue_aesni_gcm_encrypt_4: + push r13 + +$L$SEH_prologue_aesni_gcm_encrypt_5: + push r14 + +$L$SEH_prologue_aesni_gcm_encrypt_6: + push r15 + +$L$SEH_prologue_aesni_gcm_encrypt_7: + lea rsp,[((-168))+rsp] +$L$SEH_prologue_aesni_gcm_encrypt_8: +$L$SEH_prologue_aesni_gcm_encrypt_9: + + + + mov QWORD[16+rbp],rdi +$L$SEH_prologue_aesni_gcm_encrypt_10: + mov QWORD[24+rbp],rsi +$L$SEH_prologue_aesni_gcm_encrypt_11: + mov rdi,QWORD[48+rbp] + mov rsi,QWORD[56+rbp] + + movaps XMMWORD[(-208)+rbp],xmm6 +$L$SEH_prologue_aesni_gcm_encrypt_12: + movaps XMMWORD[(-192)+rbp],xmm7 +$L$SEH_prologue_aesni_gcm_encrypt_13: + movaps XMMWORD[(-176)+rbp],xmm8 +$L$SEH_prologue_aesni_gcm_encrypt_14: + movaps XMMWORD[(-160)+rbp],xmm9 +$L$SEH_prologue_aesni_gcm_encrypt_15: + movaps XMMWORD[(-144)+rbp],xmm10 +$L$SEH_prologue_aesni_gcm_encrypt_16: + movaps XMMWORD[(-128)+rbp],xmm11 +$L$SEH_prologue_aesni_gcm_encrypt_17: + movaps XMMWORD[(-112)+rbp],xmm12 +$L$SEH_prologue_aesni_gcm_encrypt_18: + movaps XMMWORD[(-96)+rbp],xmm13 +$L$SEH_prologue_aesni_gcm_encrypt_19: + movaps XMMWORD[(-80)+rbp],xmm14 +$L$SEH_prologue_aesni_gcm_encrypt_20: + movaps XMMWORD[(-64)+rbp],xmm15 +$L$SEH_prologue_aesni_gcm_encrypt_21: +$L$SEH_endprologue_aesni_gcm_encrypt_22: + vzeroupper + + vmovdqu xmm1,XMMWORD[rdi] + add rsp,-128 + mov ebx,DWORD[12+rdi] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+r9] + mov r15,0xf80 + lea r9,[128+r9] + vmovdqu xmm0,XMMWORD[r11] + and rsp,-128 + mov r10d,DWORD[((240-128))+r9] + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$enc_no_key_aliasing + cmp r15,768 + jnc NEAR $L$enc_no_key_aliasing + sub rsp,r15 +$L$enc_no_key_aliasing: + + mov r14,rdx + + + + + + + + + lea r15,[((-192))+r8*1+rdx] + + shr r8,4 + + call _aesni_ctr32_6x + vpshufb xmm8,xmm9,xmm0 + vpshufb xmm2,xmm10,xmm0 + vmovdqu XMMWORD[112+rsp],xmm8 + vpshufb xmm4,xmm11,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vpshufb xmm5,xmm12,xmm0 + vmovdqu XMMWORD[80+rsp],xmm4 + vpshufb xmm6,xmm13,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm7,xmm14,xmm0 + vmovdqu XMMWORD[48+rsp],xmm6 + + call _aesni_ctr32_6x + + mov r12,QWORD[64+rbp] + lea rsi,[32+rsi] + vmovdqu xmm8,XMMWORD[r12] + sub r8,12 + mov rax,0x60*2 + vpshufb xmm8,xmm8,xmm0 + + call _aesni_ctr32_ghash_6x + vmovdqu xmm7,XMMWORD[32+rsp] + vmovdqu xmm0,XMMWORD[r11] + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpunpckhqdq xmm1,xmm7,xmm7 + vmovdqu xmm15,XMMWORD[((32-32))+rsi] + vmovups XMMWORD[(-96)+rdx],xmm9 + vpshufb xmm9,xmm9,xmm0 + vpxor xmm1,xmm1,xmm7 + vmovups XMMWORD[(-80)+rdx],xmm10 + vpshufb xmm10,xmm10,xmm0 + vmovups XMMWORD[(-64)+rdx],xmm11 + vpshufb xmm11,xmm11,xmm0 + vmovups XMMWORD[(-48)+rdx],xmm12 + vpshufb xmm12,xmm12,xmm0 + vmovups XMMWORD[(-32)+rdx],xmm13 + vpshufb xmm13,xmm13,xmm0 + vmovups XMMWORD[(-16)+rdx],xmm14 + vpshufb xmm14,xmm14,xmm0 + vmovdqu XMMWORD[16+rsp],xmm9 + vmovdqu xmm6,XMMWORD[48+rsp] + vmovdqu xmm0,XMMWORD[((16-32))+rsi] + vpunpckhqdq xmm2,xmm6,xmm6 + vpclmulqdq xmm5,xmm7,xmm3,0x00 + vpxor xmm2,xmm2,xmm6 + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + + vmovdqu xmm9,XMMWORD[64+rsp] + vpclmulqdq xmm4,xmm6,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm5,xmm9,xmm9 + vpclmulqdq xmm6,xmm6,xmm0,0x11 + vpxor xmm5,xmm5,xmm9 + vpxor xmm6,xmm6,xmm7 + vpclmulqdq 
xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+rsi] + vpxor xmm2,xmm2,xmm1 + + vmovdqu xmm1,XMMWORD[80+rsp] + vpclmulqdq xmm7,xmm9,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+rsi] + vpxor xmm7,xmm7,xmm4 + vpunpckhqdq xmm4,xmm1,xmm1 + vpclmulqdq xmm9,xmm9,xmm3,0x11 + vpxor xmm4,xmm4,xmm1 + vpxor xmm9,xmm9,xmm6 + vpclmulqdq xmm5,xmm5,xmm15,0x00 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm2,XMMWORD[96+rsp] + vpclmulqdq xmm6,xmm1,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+rsi] + vpxor xmm6,xmm6,xmm7 + vpunpckhqdq xmm7,xmm2,xmm2 + vpclmulqdq xmm1,xmm1,xmm0,0x11 + vpxor xmm7,xmm7,xmm2 + vpxor xmm1,xmm1,xmm9 + vpclmulqdq xmm4,xmm4,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+rsi] + vpxor xmm4,xmm4,xmm5 + + vpxor xmm8,xmm8,XMMWORD[112+rsp] + vpclmulqdq xmm5,xmm2,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+rsi] + vpunpckhqdq xmm9,xmm8,xmm8 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm2,xmm2,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm2,xmm2,xmm1 + vpclmulqdq xmm7,xmm7,xmm15,0x00 + vpxor xmm4,xmm7,xmm4 + + vpclmulqdq xmm6,xmm8,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpunpckhqdq xmm1,xmm14,xmm14 + vpclmulqdq xmm8,xmm8,xmm0,0x11 + vpxor xmm1,xmm1,xmm14 + vpxor xmm5,xmm6,xmm5 + vpclmulqdq xmm9,xmm9,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((32-32))+rsi] + vpxor xmm7,xmm8,xmm2 + vpxor xmm6,xmm9,xmm4 + + vmovdqu xmm0,XMMWORD[((16-32))+rsi] + vpxor xmm9,xmm7,xmm5 + vpclmulqdq xmm4,xmm14,xmm3,0x00 + vpxor xmm6,xmm6,xmm9 + vpunpckhqdq xmm2,xmm13,xmm13 + vpclmulqdq xmm14,xmm14,xmm3,0x11 + vpxor xmm2,xmm2,xmm13 + vpslldq xmm9,xmm6,8 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + vpxor xmm8,xmm5,xmm9 + vpsrldq xmm6,xmm6,8 + vpxor xmm7,xmm7,xmm6 + + vpclmulqdq xmm5,xmm13,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+rsi] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm9,xmm12,xmm12 + vpclmulqdq xmm13,xmm13,xmm0,0x11 + vpxor xmm9,xmm9,xmm12 + vpxor xmm13,xmm13,xmm14 + vpalignr xmm14,xmm8,xmm8,8 + vpclmulqdq xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+rsi] + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm4,xmm12,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm1,xmm11,xmm11 + vpclmulqdq xmm12,xmm12,xmm3,0x11 + vpxor xmm1,xmm1,xmm11 + vpxor xmm12,xmm12,xmm13 + vxorps xmm7,xmm7,XMMWORD[16+rsp] + vpclmulqdq xmm9,xmm9,xmm15,0x00 + vpxor xmm9,xmm9,xmm2 + + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm5,xmm11,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+rsi] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm2,xmm10,xmm10 + vpclmulqdq xmm11,xmm11,xmm0,0x11 + vpxor xmm2,xmm2,xmm10 + vpalignr xmm14,xmm8,xmm8,8 + vpxor xmm11,xmm11,xmm12 + vpclmulqdq xmm1,xmm1,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+rsi] + vpxor xmm1,xmm1,xmm9 + + vxorps xmm14,xmm14,xmm7 + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm4,xmm10,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm9,xmm8,xmm8 + vpclmulqdq xmm10,xmm10,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm10,xmm10,xmm11 + vpclmulqdq xmm2,xmm2,xmm15,0x00 + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm5,xmm8,xmm0,0x00 + vpclmulqdq xmm7,xmm8,xmm0,0x11 + vpxor xmm5,xmm5,xmm4 + vpclmulqdq xmm6,xmm9,xmm15,0x10 + vpxor xmm7,xmm7,xmm10 + vpxor xmm6,xmm6,xmm2 + + vpxor xmm4,xmm7,xmm5 + vpxor xmm6,xmm6,xmm4 + vpslldq xmm1,xmm6,8 + vmovdqu xmm3,XMMWORD[16+r11] + vpsrldq xmm6,xmm6,8 + vpxor xmm8,xmm5,xmm1 + vpxor xmm7,xmm7,xmm6 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + vpxor xmm8,xmm8,xmm2 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + 
vpxor xmm2,xmm2,xmm7 + vpxor xmm8,xmm8,xmm2 + mov r12,QWORD[64+rbp] + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[r12],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-208))+rbp] + movaps xmm7,XMMWORD[((-192))+rbp] + movaps xmm8,XMMWORD[((-176))+rbp] + movaps xmm9,XMMWORD[((-160))+rbp] + movaps xmm10,XMMWORD[((-144))+rbp] + movaps xmm11,XMMWORD[((-128))+rbp] + movaps xmm12,XMMWORD[((-112))+rbp] + movaps xmm13,XMMWORD[((-96))+rbp] + movaps xmm14,XMMWORD[((-80))+rbp] + movaps xmm15,XMMWORD[((-64))+rbp] + mov rdi,QWORD[16+rbp] + mov rsi,QWORD[24+rbp] + lea rsp,[((-40))+rbp] + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + +$L$gcm_enc_abort: + ret +$L$SEH_end_aesni_gcm_encrypt_23: + + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$poly: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$one_msb: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$two_lsb: + DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +$L$one_lsb: + DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108 + DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 + DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 + DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +ALIGN 64 +section .text + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_aesni_gcm_decrypt_1 wrt ..imagebase + DD $L$SEH_end_aesni_gcm_decrypt_23 wrt ..imagebase + DD $L$SEH_info_aesni_gcm_decrypt_0 wrt ..imagebase + + DD $L$SEH_begin_aesni_gcm_encrypt_1 wrt ..imagebase + DD $L$SEH_end_aesni_gcm_encrypt_23 wrt ..imagebase + DD $L$SEH_info_aesni_gcm_encrypt_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_aesni_gcm_decrypt_0: + DB 1 + DB $L$SEH_endprologue_aesni_gcm_decrypt_22-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 33 + DB 213 + DB $L$SEH_prologue_aesni_gcm_decrypt_21-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aesni_gcm_decrypt_20-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aesni_gcm_decrypt_19-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aesni_gcm_decrypt_18-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aesni_gcm_decrypt_17-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aesni_gcm_decrypt_16-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aesni_gcm_decrypt_15-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aesni_gcm_decrypt_14-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aesni_gcm_decrypt_13-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aesni_gcm_decrypt_12-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aesni_gcm_decrypt_11-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 100 + DW 29 + DB $L$SEH_prologue_aesni_gcm_decrypt_10-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 116 + DW 28 + DB $L$SEH_prologue_aesni_gcm_decrypt_9-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 3 + DB $L$SEH_prologue_aesni_gcm_decrypt_8-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 1 + DW 21 + DB $L$SEH_prologue_aesni_gcm_decrypt_7-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 240 + DB $L$SEH_prologue_aesni_gcm_decrypt_6-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 224 + DB $L$SEH_prologue_aesni_gcm_decrypt_5-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 208 + DB $L$SEH_prologue_aesni_gcm_decrypt_4-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 192 + DB $L$SEH_prologue_aesni_gcm_decrypt_3-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 48 + DB 
$L$SEH_prologue_aesni_gcm_decrypt_2-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 80 + + DW 0 +$L$SEH_info_aesni_gcm_encrypt_0: + DB 1 + DB $L$SEH_endprologue_aesni_gcm_encrypt_22-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 33 + DB 213 + DB $L$SEH_prologue_aesni_gcm_encrypt_21-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aesni_gcm_encrypt_20-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aesni_gcm_encrypt_19-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aesni_gcm_encrypt_18-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aesni_gcm_encrypt_17-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aesni_gcm_encrypt_16-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aesni_gcm_encrypt_15-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aesni_gcm_encrypt_14-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aesni_gcm_encrypt_13-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aesni_gcm_encrypt_12-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aesni_gcm_encrypt_11-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 100 + DW 29 + DB $L$SEH_prologue_aesni_gcm_encrypt_10-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 116 + DW 28 + DB $L$SEH_prologue_aesni_gcm_encrypt_9-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 3 + DB $L$SEH_prologue_aesni_gcm_encrypt_8-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 1 + DW 21 + DB $L$SEH_prologue_aesni_gcm_encrypt_7-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 240 + DB $L$SEH_prologue_aesni_gcm_encrypt_6-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 224 + DB $L$SEH_prologue_aesni_gcm_encrypt_5-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 208 + DB $L$SEH_prologue_aesni_gcm_encrypt_4-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 192 + DB $L$SEH_prologue_aesni_gcm_encrypt_3-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 48 + DB $L$SEH_prologue_aesni_gcm_encrypt_2-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 80 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.o b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.o new file mode 100644 index 0000000000..4aca940369 Binary files /dev/null and b/ring-0.17.14/pregenerated/aesni-gcm-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/aesni-x86-elf.S b/ring-0.17.14/pregenerated/aesni-x86-elf.S new file mode 100644 index 0000000000..20a5cdf89f --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86-elf.S @@ -0,0 +1,720 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.hidden _aesni_encrypt2 +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L000enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L000enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.size _aesni_encrypt2,.-_aesni_encrypt2 +.hidden _aesni_encrypt3 +.type _aesni_encrypt3,@function +.align 16 +_aesni_encrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L001enc3_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L001enc3_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + ret +.size _aesni_encrypt3,.-_aesni_encrypt3 +.hidden _aesni_encrypt4 +.type _aesni_encrypt4,@function +.align 16 +_aesni_encrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L002enc4_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L002enc4_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + ret +.size _aesni_encrypt4,.-_aesni_encrypt4 +.hidden _aesni_encrypt6 +.type _aesni_encrypt6,@function +.align 16 +_aesni_encrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp .L003_aesni_encrypt6_inner +.align 16 +.L004enc6_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.L003_aesni_encrypt6_inner: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.L_aesni_encrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L004enc6_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 
102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + ret +.size _aesni_encrypt6,.-_aesni_encrypt6 +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function +.align 16 +aes_hw_ctr32_encrypt_blocks: +.L_aes_hw_ctr32_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L005pic_for_function_hit +.L005pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+0-.L005pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $88,%esp + andl $-16,%esp + movl %ebp,80(%esp) + cmpl $1,%eax + je .L006ctr32_one_shortcut + movdqu (%ebx),%xmm7 + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $6,%ecx + xorl %ebp,%ebp + movl %ecx,16(%esp) + movl %ecx,20(%esp) + movl %ecx,24(%esp) + movl %ebp,28(%esp) +.byte 102,15,58,22,251,3 +.byte 102,15,58,34,253,3 + movl 240(%edx),%ecx + bswap %ebx + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqa (%esp),%xmm2 +.byte 102,15,58,34,195,0 + leal 3(%ebx),%ebp +.byte 102,15,58,34,205,0 + incl %ebx +.byte 102,15,58,34,195,1 + incl %ebp +.byte 102,15,58,34,205,1 + incl %ebx +.byte 102,15,58,34,195,2 + incl %ebp +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 + cmpl $6,%eax + jb .L007ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx + movdqa %xmm7,32(%esp) + movl %edx,%ebp + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl $6,%eax + jmp .L008ctr32_loop6 +.align 16 +.L008ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 + pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 + pxor %xmm0,%xmm3 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + call .L_aesni_encrypt6_enter + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups %xmm2,(%edi) + movdqa 16(%esp),%xmm0 + xorps %xmm1,%xmm4 + movdqa 64(%esp),%xmm1 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + paddd %xmm0,%xmm1 + paddd 48(%esp),%xmm0 + movdqa (%esp),%xmm2 + movups 48(%esi),%xmm3 + movups 64(%esi),%xmm4 + xorps %xmm3,%xmm5 + movups 80(%esi),%xmm3 + leal 96(%esi),%esi + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 + xorps %xmm4,%xmm6 + movups %xmm5,48(%edi) + xorps %xmm3,%xmm7 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + movups %xmm6,64(%edi) + pshufd $192,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + pshufd $128,%xmm0,%xmm3 + subl $6,%eax + jnc .L008ctr32_loop6 + addl $6,%eax + jz .L009ctr32_ret + movdqu (%ebp),%xmm7 + movl %ebp,%edx + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +.L007ctr32_tail: + por %xmm7,%xmm2 + cmpl $2,%eax + jb .L010ctr32_one + pshufd $64,%xmm0,%xmm4 + por %xmm7,%xmm3 + je .L011ctr32_two + pshufd $192,%xmm1,%xmm5 + por %xmm7,%xmm4 + cmpl $4,%eax + jb .L012ctr32_three + pshufd $128,%xmm1,%xmm6 + por %xmm7,%xmm5 + je .L013ctr32_four + por %xmm7,%xmm6 + call 
_aesni_encrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm4 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm5 + movups %xmm2,(%edi) + xorps %xmm1,%xmm6 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp .L009ctr32_ret +.align 16 +.L006ctr32_one_shortcut: + movups (%ebx),%xmm2 + movl 240(%edx),%ecx +.L010ctr32_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L014enc1_loop_1: +.byte 102,15,56,220,209 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L014enc1_loop_1 +.byte 102,15,56,221,209 + movups (%esi),%xmm6 + xorps %xmm2,%xmm6 + movups %xmm6,(%edi) + jmp .L009ctr32_ret +.align 16 +.L011ctr32_two: + call _aesni_encrypt2 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp .L009ctr32_ret +.align 16 +.L012ctr32_three: + call _aesni_encrypt3 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + movups 32(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm7,%xmm4 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp .L009ctr32_ret +.align 16 +.L013ctr32_four: + call _aesni_encrypt4 + movups (%esi),%xmm6 + movups 16(%esi),%xmm7 + movups 32(%esi),%xmm1 + xorps %xmm6,%xmm2 + movups 48(%esi),%xmm0 + xorps %xmm7,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +.L009ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movl 80(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function +.align 16 +aes_hw_set_encrypt_key_base: +.L_aes_hw_set_encrypt_key_base_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L015pic_for_function_hit +.L015pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L015pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call .L016pic +.L016pic: + popl %ebx + leal .Lkey_const-.L016pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je .L01714rounds + cmpl $128,%ecx + jne .L018bad_keybits +.align 16 +.L01910rounds: + movl $9,%ecx + movups %xmm0,-16(%edx) +.byte 102,15,58,223,200,1 + call .L020key_128_cold +.byte 102,15,58,223,200,2 + call .L021key_128 +.byte 102,15,58,223,200,4 + call .L021key_128 +.byte 102,15,58,223,200,8 + call .L021key_128 +.byte 102,15,58,223,200,16 + call .L021key_128 +.byte 102,15,58,223,200,32 + call .L021key_128 +.byte 102,15,58,223,200,64 + call .L021key_128 +.byte 102,15,58,223,200,128 + call .L021key_128 +.byte 102,15,58,223,200,27 + call .L021key_128 +.byte 102,15,58,223,200,54 + call .L021key_128 + movups %xmm0,(%edx) + movl %ecx,80(%edx) + jmp .L022good_key +.align 16 +.L021key_128: + movups %xmm0,(%edx) + leal 16(%edx),%edx +.L020key_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + 
xorps %xmm1,%xmm0 + ret +.align 16 +.L01714rounds: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movl $13,%ecx + movups %xmm0,-32(%edx) + movups %xmm2,-16(%edx) +.byte 102,15,58,223,202,1 + call .L023key_256a_cold +.byte 102,15,58,223,200,1 + call .L024key_256b +.byte 102,15,58,223,202,2 + call .L025key_256a +.byte 102,15,58,223,200,2 + call .L024key_256b +.byte 102,15,58,223,202,4 + call .L025key_256a +.byte 102,15,58,223,200,4 + call .L024key_256b +.byte 102,15,58,223,202,8 + call .L025key_256a +.byte 102,15,58,223,200,8 + call .L024key_256b +.byte 102,15,58,223,202,16 + call .L025key_256a +.byte 102,15,58,223,200,16 + call .L024key_256b +.byte 102,15,58,223,202,32 + call .L025key_256a +.byte 102,15,58,223,200,32 + call .L024key_256b +.byte 102,15,58,223,202,64 + call .L025key_256a + movups %xmm0,(%edx) + movl %ecx,16(%edx) + xorl %eax,%eax + jmp .L022good_key +.align 16 +.L025key_256a: + movups %xmm2,(%edx) + leal 16(%edx),%edx +.L023key_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 16 +.L024key_256b: + movups %xmm0,(%edx) + leal 16(%edx),%edx + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +.L022good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 4 +.L018bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.size aes_hw_set_encrypt_key_base,.-.L_aes_hw_set_encrypt_key_base_begin +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.L_aes_hw_set_encrypt_key_alt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L026pic_for_function_hit +.L026pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L026pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call .L027pic +.L027pic: + popl %ebx + leal .Lkey_const-.L027pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je .L02814rounds_alt + cmpl $128,%ecx + jne .L029bad_keybits +.align 16 +.L03010rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L031loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L031loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L032good_key +.align 16 +.L02814rounds_alt: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movdqa (%ebx),%xmm5 + movdqa 
32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +.L033loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz .L034done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp .L033loop_key256 +.L034done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +.L032good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 4 +.L029bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.size aes_hw_set_encrypt_key_alt,.-.L_aes_hw_set_encrypt_key_alt_begin +.align 64 +.Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +.byte 115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/aesni-x86-win32n.asm b/ring-0.17.14/pregenerated/aesni-x86-win32n.asm new file mode 100644 index 0000000000..b9fe666eb7 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86-win32n.asm @@ -0,0 +1,709 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif +align 16 +__aesni_encrypt2: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$000enc2_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$000enc2_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,221,208 +db 102,15,56,221,216 + ret +align 16 +__aesni_encrypt3: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$001enc3_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$001enc3_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 + ret +align 16 +__aesni_encrypt4: + movups xmm0,[edx] + movups xmm1,[16+edx] + shl ecx,4 + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx +db 15,31,64,0 + add ecx,16 +L$002enc4_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 +db 102,15,56,220,232 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$002enc4_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 +db 102,15,56,221,232 + ret +align 16 +__aesni_encrypt6: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 +db 102,15,56,220,209 + pxor xmm5,xmm0 + pxor xmm6,xmm0 +db 102,15,56,220,217 + lea edx,[32+ecx*1+edx] + neg ecx +db 102,15,56,220,225 + pxor xmm7,xmm0 + movups xmm0,[ecx*1+edx] + add ecx,16 + jmp NEAR L$003_aesni_encrypt6_inner +align 16 +L$004enc6_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +L$003_aesni_encrypt6_inner: +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 +L$_aesni_encrypt6_enter: + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 +db 102,15,56,220,232 +db 102,15,56,220,240 +db 102,15,56,220,248 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$004enc6_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 +db 102,15,56,221,232 +db 102,15,56,221,240 +db 102,15,56,221,248 + ret +global _aes_hw_ctr32_encrypt_blocks +align 16 +_aes_hw_ctr32_encrypt_blocks: +L$_aes_hw_ctr32_encrypt_blocks_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$005pic_for_function_hit +L$005pic_for_function_hit: + pop ebx + lea 
ebx,[(_BORINGSSL_function_hit+0-L$005pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub esp,88 + and esp,-16 + mov DWORD [80+esp],ebp + cmp eax,1 + je NEAR L$006ctr32_one_shortcut + movdqu xmm7,[ebx] + mov DWORD [esp],202182159 + mov DWORD [4+esp],134810123 + mov DWORD [8+esp],67438087 + mov DWORD [12+esp],66051 + mov ecx,6 + xor ebp,ebp + mov DWORD [16+esp],ecx + mov DWORD [20+esp],ecx + mov DWORD [24+esp],ecx + mov DWORD [28+esp],ebp +db 102,15,58,22,251,3 +db 102,15,58,34,253,3 + mov ecx,DWORD [240+edx] + bswap ebx + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movdqa xmm2,[esp] +db 102,15,58,34,195,0 + lea ebp,[3+ebx] +db 102,15,58,34,205,0 + inc ebx +db 102,15,58,34,195,1 + inc ebp +db 102,15,58,34,205,1 + inc ebx +db 102,15,58,34,195,2 + inc ebp +db 102,15,58,34,205,2 + movdqa [48+esp],xmm0 +db 102,15,56,0,194 + movdqu xmm6,[edx] + movdqa [64+esp],xmm1 +db 102,15,56,0,202 + pshufd xmm2,xmm0,192 + pshufd xmm3,xmm0,128 + cmp eax,6 + jb NEAR L$007ctr32_tail + pxor xmm7,xmm6 + shl ecx,4 + mov ebx,16 + movdqa [32+esp],xmm7 + mov ebp,edx + sub ebx,ecx + lea edx,[32+ecx*1+edx] + sub eax,6 + jmp NEAR L$008ctr32_loop6 +align 16 +L$008ctr32_loop6: + pshufd xmm4,xmm0,64 + movdqa xmm0,[32+esp] + pshufd xmm5,xmm1,192 + pxor xmm2,xmm0 + pshufd xmm6,xmm1,128 + pxor xmm3,xmm0 + pshufd xmm7,xmm1,64 + movups xmm1,[16+ebp] + pxor xmm4,xmm0 + pxor xmm5,xmm0 +db 102,15,56,220,209 + pxor xmm6,xmm0 + pxor xmm7,xmm0 +db 102,15,56,220,217 + movups xmm0,[32+ebp] + mov ecx,ebx +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 + call L$_aesni_encrypt6_enter + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,xmm1 + movups xmm1,[32+esi] + xorps xmm3,xmm0 + movups [edi],xmm2 + movdqa xmm0,[16+esp] + xorps xmm4,xmm1 + movdqa xmm1,[64+esp] + movups [16+edi],xmm3 + movups [32+edi],xmm4 + paddd xmm1,xmm0 + paddd xmm0,[48+esp] + movdqa xmm2,[esp] + movups xmm3,[48+esi] + movups xmm4,[64+esi] + xorps xmm5,xmm3 + movups xmm3,[80+esi] + lea esi,[96+esi] + movdqa [48+esp],xmm0 +db 102,15,56,0,194 + xorps xmm6,xmm4 + movups [48+edi],xmm5 + xorps xmm7,xmm3 + movdqa [64+esp],xmm1 +db 102,15,56,0,202 + movups [64+edi],xmm6 + pshufd xmm2,xmm0,192 + movups [80+edi],xmm7 + lea edi,[96+edi] + pshufd xmm3,xmm0,128 + sub eax,6 + jnc NEAR L$008ctr32_loop6 + add eax,6 + jz NEAR L$009ctr32_ret + movdqu xmm7,[ebp] + mov edx,ebp + pxor xmm7,[32+esp] + mov ecx,DWORD [240+ebp] +L$007ctr32_tail: + por xmm2,xmm7 + cmp eax,2 + jb NEAR L$010ctr32_one + pshufd xmm4,xmm0,64 + por xmm3,xmm7 + je NEAR L$011ctr32_two + pshufd xmm5,xmm1,192 + por xmm4,xmm7 + cmp eax,4 + jb NEAR L$012ctr32_three + pshufd xmm6,xmm1,128 + por xmm5,xmm7 + je NEAR L$013ctr32_four + por xmm6,xmm7 + call __aesni_encrypt6 + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,xmm1 + movups xmm1,[32+esi] + xorps xmm3,xmm0 + movups xmm0,[48+esi] + xorps xmm4,xmm1 + movups xmm1,[64+esi] + xorps xmm5,xmm0 + movups [edi],xmm2 + xorps xmm6,xmm1 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + jmp NEAR L$009ctr32_ret +align 16 +L$006ctr32_one_shortcut: + movups xmm2,[ebx] + mov ecx,DWORD [240+edx] +L$010ctr32_one: + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$014enc1_loop_1: +db 102,15,56,220,209 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$014enc1_loop_1 +db 
102,15,56,221,209 + movups xmm6,[esi] + xorps xmm6,xmm2 + movups [edi],xmm6 + jmp NEAR L$009ctr32_ret +align 16 +L$011ctr32_two: + call __aesni_encrypt2 + movups xmm5,[esi] + movups xmm6,[16+esi] + xorps xmm2,xmm5 + xorps xmm3,xmm6 + movups [edi],xmm2 + movups [16+edi],xmm3 + jmp NEAR L$009ctr32_ret +align 16 +L$012ctr32_three: + call __aesni_encrypt3 + movups xmm5,[esi] + movups xmm6,[16+esi] + xorps xmm2,xmm5 + movups xmm7,[32+esi] + xorps xmm3,xmm6 + movups [edi],xmm2 + xorps xmm4,xmm7 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + jmp NEAR L$009ctr32_ret +align 16 +L$013ctr32_four: + call __aesni_encrypt4 + movups xmm6,[esi] + movups xmm7,[16+esi] + movups xmm1,[32+esi] + xorps xmm2,xmm6 + movups xmm0,[48+esi] + xorps xmm3,xmm7 + movups [edi],xmm2 + xorps xmm4,xmm1 + movups [16+edi],xmm3 + xorps xmm5,xmm0 + movups [32+edi],xmm4 + movups [48+edi],xmm5 +L$009ctr32_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + movdqa [32+esp],xmm0 + pxor xmm5,xmm5 + movdqa [48+esp],xmm0 + pxor xmm6,xmm6 + movdqa [64+esp],xmm0 + pxor xmm7,xmm7 + mov esp,DWORD [80+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _aes_hw_set_encrypt_key_base +align 16 +_aes_hw_set_encrypt_key_base: +L$_aes_hw_set_encrypt_key_base_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$015pic_for_function_hit +L$015pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+3-L$015pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov eax,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov edx,DWORD [12+esp] + push ebx + call L$016pic +L$016pic: + pop ebx + lea ebx,[(L$key_const-L$016pic)+ebx] + movups xmm0,[eax] + xorps xmm4,xmm4 + lea edx,[16+edx] + cmp ecx,256 + je NEAR L$01714rounds + cmp ecx,128 + jne NEAR L$018bad_keybits +align 16 +L$01910rounds: + mov ecx,9 + movups [edx-16],xmm0 +db 102,15,58,223,200,1 + call L$020key_128_cold +db 102,15,58,223,200,2 + call L$021key_128 +db 102,15,58,223,200,4 + call L$021key_128 +db 102,15,58,223,200,8 + call L$021key_128 +db 102,15,58,223,200,16 + call L$021key_128 +db 102,15,58,223,200,32 + call L$021key_128 +db 102,15,58,223,200,64 + call L$021key_128 +db 102,15,58,223,200,128 + call L$021key_128 +db 102,15,58,223,200,27 + call L$021key_128 +db 102,15,58,223,200,54 + call L$021key_128 + movups [edx],xmm0 + mov DWORD [80+edx],ecx + jmp NEAR L$022good_key +align 16 +L$021key_128: + movups [edx],xmm0 + lea edx,[16+edx] +L$020key_128_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret +align 16 +L$01714rounds: + movups xmm2,[16+eax] + lea edx,[16+edx] + mov ecx,13 + movups [edx-32],xmm0 + movups [edx-16],xmm2 +db 102,15,58,223,202,1 + call L$023key_256a_cold +db 102,15,58,223,200,1 + call L$024key_256b +db 102,15,58,223,202,2 + call L$025key_256a +db 102,15,58,223,200,2 + call L$024key_256b +db 102,15,58,223,202,4 + call L$025key_256a +db 102,15,58,223,200,4 + call L$024key_256b +db 102,15,58,223,202,8 + call L$025key_256a +db 102,15,58,223,200,8 + call L$024key_256b +db 102,15,58,223,202,16 + call L$025key_256a +db 102,15,58,223,200,16 + call L$024key_256b +db 102,15,58,223,202,32 + call L$025key_256a +db 102,15,58,223,200,32 + call L$024key_256b +db 102,15,58,223,202,64 + call L$025key_256a + movups [edx],xmm0 + mov DWORD [16+edx],ecx + xor eax,eax + jmp NEAR L$022good_key +align 16 +L$025key_256a: + movups [edx],xmm2 + lea edx,[16+edx] +L$023key_256a_cold: + shufps xmm4,xmm0,16 + xorps 
xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret +align 16 +L$024key_256b: + movups [edx],xmm0 + lea edx,[16+edx] + shufps xmm4,xmm2,16 + xorps xmm2,xmm4 + shufps xmm4,xmm2,140 + xorps xmm2,xmm4 + shufps xmm1,xmm1,170 + xorps xmm2,xmm1 + ret +L$022good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + ret +align 4 +L$018bad_keybits: + pxor xmm0,xmm0 + mov eax,-2 + pop ebx + ret +global _aes_hw_set_encrypt_key_alt +align 16 +_aes_hw_set_encrypt_key_alt: +L$_aes_hw_set_encrypt_key_alt_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$026pic_for_function_hit +L$026pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+3-L$026pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov eax,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov edx,DWORD [12+esp] + push ebx + call L$027pic +L$027pic: + pop ebx + lea ebx,[(L$key_const-L$027pic)+ebx] + movups xmm0,[eax] + xorps xmm4,xmm4 + lea edx,[16+edx] + cmp ecx,256 + je NEAR L$02814rounds_alt + cmp ecx,128 + jne NEAR L$029bad_keybits +align 16 +L$03010rounds_alt: + movdqa xmm5,[ebx] + mov ecx,8 + movdqa xmm4,[32+ebx] + movdqa xmm2,xmm0 + movdqu [edx-16],xmm0 +L$031loop_key128: +db 102,15,56,0,197 +db 102,15,56,221,196 + pslld xmm4,1 + lea edx,[16+edx] + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [edx-16],xmm0 + movdqa xmm2,xmm0 + dec ecx + jnz NEAR L$031loop_key128 + movdqa xmm4,[48+ebx] +db 102,15,56,0,197 +db 102,15,56,221,196 + pslld xmm4,1 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [edx],xmm0 + movdqa xmm2,xmm0 +db 102,15,56,0,197 +db 102,15,56,221,196 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [16+edx],xmm0 + mov ecx,9 + mov DWORD [96+edx],ecx + jmp NEAR L$032good_key +align 16 +L$02814rounds_alt: + movups xmm2,[16+eax] + lea edx,[16+edx] + movdqa xmm5,[ebx] + movdqa xmm4,[32+ebx] + mov ecx,7 + movdqu [edx-32],xmm0 + movdqa xmm1,xmm2 + movdqu [edx-16],xmm2 +L$033loop_key256: +db 102,15,56,0,213 +db 102,15,56,221,212 + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + pxor xmm0,xmm2 + movdqu [edx],xmm0 + dec ecx + jz NEAR L$034done_key256 + pshufd xmm2,xmm0,255 + pxor xmm3,xmm3 +db 102,15,56,221,211 + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + pxor xmm2,xmm1 + movdqu [16+edx],xmm2 + lea edx,[32+edx] + movdqa xmm1,xmm2 + jmp NEAR L$033loop_key256 +L$034done_key256: + mov ecx,13 + mov DWORD [16+edx],ecx +L$032good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + ret +align 4 +L$029bad_keybits: + pxor xmm0,xmm0 + mov eax,-2 + pop ebx + ret +align 64 +L$key_const: +dd 202313229,202313229,202313229,202313229 +dd 67569157,67569157,67569157,67569157 +dd 1,1,1,1 +dd 27,27,27,27 +db 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +db 115,108,46,111,114,103,62,0 +%else +; Work around 
https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/aesni-x86-win32n.o b/ring-0.17.14/pregenerated/aesni-x86-win32n.o new file mode 100644 index 0000000000..12f7b0f1a7 Binary files /dev/null and b/ring-0.17.14/pregenerated/aesni-x86-win32n.o differ diff --git a/ring-0.17.14/pregenerated/aesni-x86_64-elf.S b/ring-0.17.14/pregenerated/aesni-x86_64-elf.S new file mode 100644 index 0000000000..3504d56a8a --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86_64-elf.S @@ -0,0 +1,1072 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.cfi_endproc +.size _aesni_encrypt2,.-_aesni_encrypt2 +.type _aesni_encrypt3,@function +.align 16 +_aesni_encrypt3: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop3: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop3 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + ret +.cfi_endproc +.size _aesni_encrypt3,.-_aesni_encrypt3 +.type _aesni_encrypt4,@function +.align 16 +_aesni_encrypt4: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +.Lenc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + ret +.cfi_endproc +.size _aesni_encrypt4,.-_aesni_encrypt4 +.type _aesni_encrypt6,@function +.align 16 +_aesni_encrypt6: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,217 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop6_enter +.align 16 +.Lenc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 
102,15,56,220,225 +.Lenc_loop6_enter: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + ret +.cfi_endproc +.size _aesni_encrypt6,.-_aesni_encrypt6 +.type _aesni_encrypt8,@function +.align 16 +_aesni_encrypt8: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop8_inner +.align 16 +.Lenc_loop8: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.Lenc_loop8_inner: +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +.Lenc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop8 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 +.byte 102,68,15,56,221,192 +.byte 102,68,15,56,221,200 + ret +.cfi_endproc +.size _aesni_encrypt8,.-_aesni_encrypt8 +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function +.align 16 +aes_hw_ctr32_encrypt_blocks: +.cfi_startproc +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit(%rip) +#endif + cmpq $1,%rdx + jne .Lctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_1: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_1 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 + pushq %rbp +.cfi_offset %rbp,-16 + subq $128,%rsp + andq $-16,%rsp + + + + + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%ebp + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %ebp,%eax + xorl %ebp,%edx +.byte 
102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %ebp,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %ebp,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + movl 240(%rcx),%eax + xorl %ebp,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %ebp,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %ebp,%r9d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx + jb .Lctr32_tail + + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp .Lctr32_loop8 + +.align 32 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 +.byte 102,15,56,220,209 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 +.byte 102,15,56,220,217 + bswapl %r9d + movups 32-128(%rcx),%xmm0 +.byte 102,15,56,220,225 + xorl %ebp,%r9d + nop +.byte 102,15,56,220,233 + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + xorl %ebp,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb .Lctr32_enc_done + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 
102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + prefetcht0 448(%rdi) + prefetcht0 512(%rdi) + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 + + addq $8,%rdx + jz .Lctr32_done + leaq -128(%rcx),%rcx + +.Lctr32_tail: + + + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb .Lctr32_loop3 + je .Lctr32_loop4 + + + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 + + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 + + call .Lenc_loop8_enter + + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done + + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done + + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 
(%rcx),%xmm1 + jnz .Lctr32_loop4 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz .Lctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) + +.Lctr32_done: + xorps %xmm0,%xmm0 + xorl %ebp,%ebp + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lctr32_epilogue: + ret +.cfi_endproc +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function +.align 16 +aes_hw_set_encrypt_key_base: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds + + + cmpl $128,%esi + jne .Lbad_keybits + +.L10rounds: + movl $9,%esi + + movups %xmm0,(%rdx) +.byte 102,15,58,223,200,1 + call .Lkey_expansion_128_cold +.byte 102,15,58,223,200,2 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,4 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,8 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,16 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,32 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,64 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,128 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,27 + call .Lkey_expansion_128 +.byte 102,15,58,223,200,54 + call .Lkey_expansion_128 + movups %xmm0,(%rax) + movl %esi,80(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + + + +.align 16 +.L14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call .Lkey_expansion_256a_cold +.byte 102,15,58,223,200,1 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,2 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,2 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,4 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,4 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,8 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,8 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,16 + call .Lkey_expansion_256a +.byte 102,15,58,223,200,16 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,32 + call 
.Lkey_expansion_256a +.byte 102,15,58,223,200,32 + call .Lkey_expansion_256b +.byte 102,15,58,223,202,64 + call .Lkey_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + movq $-2,%rax +.Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc + + +.align 16 +.Lkey_expansion_128: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256a: +.cfi_startproc + movups %xmm2,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256b: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +.cfi_endproc +.size aes_hw_set_encrypt_key_base,.-aes_hw_set_encrypt_key_base + +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds_alt + + cmpl $128,%esi + jne .Lbad_keybits_alt + + movl $9,%esi + movdqa .Lkey_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp .Loop_key128 + +.align 16 +.Loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret_alt + + + +.align 16 +.L14rounds_alt: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp .Loop_key256 + +.align 16 +.Loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz .Ldone_key256 + + pshufd $0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 
+ pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret_alt + +.align 16 +.Lbad_keybits_alt: + movq $-2,%rax +.Lenc_key_ret_alt: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc + +.size aes_hw_set_encrypt_key_alt,.-aes_hw_set_encrypt_key_alt +.section .rodata +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lincrement32: +.long 6,6,6,0 +.Lincrement64: +.long 1,0,0,0 +.Lincrement1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: +.long 1,1,1,1 +.Lkey_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b + +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/ring-0.17.14/pregenerated/aesni-x86_64-macosx.S b/ring-0.17.14/pregenerated/aesni-x86_64-macosx.S new file mode 100644 index 0000000000..5737063d55 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86_64-macosx.S @@ -0,0 +1,1072 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.p2align 4 +_aesni_encrypt2: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$enc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret + + + +.p2align 4 +_aesni_encrypt3: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$enc_loop3: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop3 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 + ret + + + +.p2align 4 +_aesni_encrypt4: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +L$enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 
102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 + ret + + + +.p2align 4 +_aesni_encrypt6: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,217 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 +.byte 102,15,56,220,225 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$enc_loop6_enter +.p2align 4 +L$enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +L$enc_loop6_enter: +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 + ret + + + +.p2align 4 +_aesni_encrypt8: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$enc_loop8_inner +.p2align 4 +L$enc_loop8: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +L$enc_loop8_inner: +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +L$enc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop8 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 +.byte 102,15,56,221,224 +.byte 102,15,56,221,232 +.byte 102,15,56,221,240 +.byte 102,15,56,221,248 +.byte 102,68,15,56,221,192 +.byte 102,68,15,56,221,200 + ret + + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks + +.p2align 4 +_aes_hw_ctr32_encrypt_blocks: + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit(%rip) +#endif + cmpq $1,%rdx + jne L$ctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_enc1_1: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_1 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp L$ctr32_epilogue + +.p2align 4 +L$ctr32_bulk: + leaq (%rsp),%r11 + + pushq %rbp + + subq $128,%rsp + andq $-16,%rsp + + + + + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + 
movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%ebp + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %ebp,%eax + xorl %ebp,%edx +.byte 102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %ebp,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %ebp,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + movl 240(%rcx),%eax + xorl %ebp,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %ebp,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %ebp,%r9d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx + jb L$ctr32_tail + + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp L$ctr32_loop8 + +.p2align 5 +L$ctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 +.byte 102,15,56,220,209 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 +.byte 102,15,56,220,217 + bswapl %r9d + movups 32-128(%rcx),%xmm0 +.byte 102,15,56,220,225 + xorl %ebp,%r9d + nop +.byte 102,15,56,220,233 + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %ebp,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 
102,15,56,220,224 + xorl %ebp,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb L$ctr32_enc_done + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp L$ctr32_enc_done + +.p2align 4 +L$ctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + prefetcht0 448(%rdi) + prefetcht0 512(%rdi) + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc L$ctr32_loop8 + + addq $8,%rdx + jz L$ctr32_done + leaq -128(%rcx),%rcx + +L$ctr32_tail: + + + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb L$ctr32_loop3 + je L$ctr32_loop4 + + + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 + + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 + + call L$enc_loop8_enter + + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb 
L$ctr32_done + + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je L$ctr32_done + + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + jnz L$ctr32_loop4 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz L$ctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb L$ctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je L$ctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) + +L$ctr32_done: + xorps %xmm0,%xmm0 + xorl %ebp,%ebp + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 + movq -8(%r11),%rbp + + leaq (%r11),%rsp + +L$ctr32_epilogue: + ret + + +.globl _aes_hw_set_encrypt_key_base +.private_extern _aes_hw_set_encrypt_key_base + +.p2align 4 +_aes_hw_set_encrypt_key_base: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp + + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je L$14rounds + + + cmpl $128,%esi + jne L$bad_keybits + +L$10rounds: + movl $9,%esi + + movups %xmm0,(%rdx) +.byte 102,15,58,223,200,1 + call L$key_expansion_128_cold +.byte 102,15,58,223,200,2 + call L$key_expansion_128 +.byte 102,15,58,223,200,4 + call L$key_expansion_128 +.byte 102,15,58,223,200,8 + call L$key_expansion_128 +.byte 102,15,58,223,200,16 + call L$key_expansion_128 +.byte 102,15,58,223,200,32 + call L$key_expansion_128 +.byte 102,15,58,223,200,64 + call L$key_expansion_128 +.byte 102,15,58,223,200,128 + call L$key_expansion_128 +.byte 102,15,58,223,200,27 + call L$key_expansion_128 +.byte 102,15,58,223,200,54 + call L$key_expansion_128 + movups %xmm0,(%rax) + movl %esi,80(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + + + +.p2align 4 +L$14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) +.byte 102,15,58,223,202,1 + call L$key_expansion_256a_cold +.byte 102,15,58,223,200,1 + call L$key_expansion_256b +.byte 102,15,58,223,202,2 + call L$key_expansion_256a +.byte 102,15,58,223,200,2 + call L$key_expansion_256b +.byte 102,15,58,223,202,4 + call L$key_expansion_256a +.byte 102,15,58,223,200,4 + call L$key_expansion_256b +.byte 102,15,58,223,202,8 + call L$key_expansion_256a +.byte 102,15,58,223,200,8 
+ call L$key_expansion_256b +.byte 102,15,58,223,202,16 + call L$key_expansion_256a +.byte 102,15,58,223,200,16 + call L$key_expansion_256b +.byte 102,15,58,223,202,32 + call L$key_expansion_256a +.byte 102,15,58,223,200,32 + call L$key_expansion_256b +.byte 102,15,58,223,202,64 + call L$key_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp L$enc_key_ret + +.p2align 4 +L$bad_keybits: + movq $-2,%rax +L$enc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + + ret + + + +.p2align 4 +L$key_expansion_128: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax +L$key_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_256a: + + movups %xmm2,(%rax) + leaq 16(%rax),%rax +L$key_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_256b: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret + + + +.globl _aes_hw_set_encrypt_key_alt +.private_extern _aes_hw_set_encrypt_key_alt + +.p2align 4 +_aes_hw_set_encrypt_key_alt: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp + + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je L$14rounds_alt + + cmpl $128,%esi + jne L$bad_keybits_alt + + movl $9,%esi + movdqa L$key_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa L$key_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp L$oop_key128 + +.p2align 4 +L$oop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz L$oop_key128 + + movdqa L$key_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp L$enc_key_ret_alt + + + +.p2align 4 +L$14rounds_alt: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + movdqa L$key_rotate(%rip),%xmm5 + movdqa L$key_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp L$oop_key256 + +.p2align 4 +L$oop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz L$done_key256 + + pshufd $0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor 
%xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp L$oop_key256 + +L$done_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp L$enc_key_ret_alt + +.p2align 4 +L$bad_keybits_alt: + movq $-2,%rax +L$enc_key_ret_alt: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$increment32: +.long 6,6,6,0 +L$increment64: +.long 1,0,0,0 +L$increment1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$key_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +L$key_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +L$key_rcon1: +.long 1,1,1,1 +L$key_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b + +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/ring-0.17.14/pregenerated/aesni-x86_64-nasm.asm b/ring-0.17.14/pregenerated/aesni-x86_64-nasm.asm new file mode 100644 index 0000000000..ceeec55321 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesni-x86_64-nasm.asm @@ -0,0 +1,1242 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +ALIGN 16 +_aesni_encrypt2: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$enc_loop2: + DB 102,15,56,220,209 + DB 102,15,56,220,217 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop2 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,221,208 + DB 102,15,56,221,216 + ret + + + +ALIGN 16 +_aesni_encrypt3: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$enc_loop3: + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop3 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,221,208 + DB 102,15,56,221,216 + DB 102,15,56,221,224 + ret + + + +ALIGN 16 +_aesni_encrypt4: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + xorps xmm5,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + DB 0x0f,0x1f,0x00 + add rax,16 + +$L$enc_loop4: + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop4 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,221,208 + 
DB 102,15,56,221,216 + DB 102,15,56,221,224 + DB 102,15,56,221,232 + ret + + + +ALIGN 16 +_aesni_encrypt6: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + DB 102,15,56,220,209 + lea rcx,[32+rax*1+rcx] + neg rax + DB 102,15,56,220,217 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + DB 102,15,56,220,225 + pxor xmm7,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$enc_loop6_enter +ALIGN 16 +$L$enc_loop6: + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 +$L$enc_loop6_enter: + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop6 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,15,56,221,208 + DB 102,15,56,221,216 + DB 102,15,56,221,224 + DB 102,15,56,221,232 + DB 102,15,56,221,240 + DB 102,15,56,221,248 + ret + + + +ALIGN 16 +_aesni_encrypt8: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + lea rcx,[32+rax*1+rcx] + neg rax + DB 102,15,56,220,209 + pxor xmm7,xmm0 + pxor xmm8,xmm0 + DB 102,15,56,220,217 + pxor xmm9,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$enc_loop8_inner +ALIGN 16 +$L$enc_loop8: + DB 102,15,56,220,209 + DB 102,15,56,220,217 +$L$enc_loop8_inner: + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 +$L$enc_loop8_enter: + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop8 + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + DB 102,15,56,221,208 + DB 102,15,56,221,216 + DB 102,15,56,221,224 + DB 102,15,56,221,232 + DB 102,15,56,221,240 + DB 102,15,56,221,248 + DB 102,68,15,56,221,192 + DB 102,68,15,56,221,200 + ret + + +global aes_hw_ctr32_encrypt_blocks + +ALIGN 16 +aes_hw_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes_hw_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[BORINGSSL_function_hit],1 +%endif + cmp rdx,1 + jne NEAR $L$ctr32_bulk + + + + movups xmm2,XMMWORD[r8] + movups xmm3,XMMWORD[rdi] + mov edx,DWORD[240+rcx] + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + lea rcx,[32+rcx] + xorps xmm2,xmm0 +$L$oop_enc1_1: + DB 102,15,56,220,209 + dec edx + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR $L$oop_enc1_1 + DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD[rsi],xmm2 + xorps xmm2,xmm2 + jmp NEAR $L$ctr32_epilogue + +ALIGN 16 +$L$ctr32_bulk: + lea r11,[rsp] + + push rbp + + sub rsp,288 + and rsp,-16 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 
+ movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 +$L$ctr32_body: + + + + + movdqu xmm2,XMMWORD[r8] + movdqu xmm0,XMMWORD[rcx] + mov r8d,DWORD[12+r8] + pxor xmm2,xmm0 + mov ebp,DWORD[12+rcx] + movdqa XMMWORD[rsp],xmm2 + bswap r8d + movdqa xmm3,xmm2 + movdqa xmm4,xmm2 + movdqa xmm5,xmm2 + movdqa XMMWORD[64+rsp],xmm2 + movdqa XMMWORD[80+rsp],xmm2 + movdqa XMMWORD[96+rsp],xmm2 + mov r10,rdx + movdqa XMMWORD[112+rsp],xmm2 + + lea rax,[1+r8] + lea rdx,[2+r8] + bswap eax + bswap edx + xor eax,ebp + xor edx,ebp +DB 102,15,58,34,216,3 + lea rax,[3+r8] + movdqa XMMWORD[16+rsp],xmm3 +DB 102,15,58,34,226,3 + bswap eax + mov rdx,r10 + lea r10,[4+r8] + movdqa XMMWORD[32+rsp],xmm4 + xor eax,ebp + bswap r10d +DB 102,15,58,34,232,3 + xor r10d,ebp + movdqa XMMWORD[48+rsp],xmm5 + lea r9,[5+r8] + mov DWORD[((64+12))+rsp],r10d + bswap r9d + lea r10,[6+r8] + mov eax,DWORD[240+rcx] + xor r9d,ebp + bswap r10d + mov DWORD[((80+12))+rsp],r9d + xor r10d,ebp + lea r9,[7+r8] + mov DWORD[((96+12))+rsp],r10d + bswap r9d + xor r9d,ebp + mov DWORD[((112+12))+rsp],r9d + + movups xmm1,XMMWORD[16+rcx] + + movdqa xmm6,XMMWORD[64+rsp] + movdqa xmm7,XMMWORD[80+rsp] + + cmp rdx,8 + jb NEAR $L$ctr32_tail + + lea rcx,[128+rcx] + sub rdx,8 + jmp NEAR $L$ctr32_loop8 + +ALIGN 32 +$L$ctr32_loop8: + add r8d,8 + movdqa xmm8,XMMWORD[96+rsp] + DB 102,15,56,220,209 + mov r9d,r8d + movdqa xmm9,XMMWORD[112+rsp] + DB 102,15,56,220,217 + bswap r9d + movups xmm0,XMMWORD[((32-128))+rcx] + DB 102,15,56,220,225 + xor r9d,ebp + nop + DB 102,15,56,220,233 + mov DWORD[((0+12))+rsp],r9d + lea r9,[1+r8] + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((48-128))+rcx] + bswap r9d + DB 102,15,56,220,208 + DB 102,15,56,220,216 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + mov DWORD[((16+12))+rsp],r9d + lea r9,[2+r8] + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((64-128))+rcx] + bswap r9d + DB 102,15,56,220,209 + DB 102,15,56,220,217 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + mov DWORD[((32+12))+rsp],r9d + lea r9,[3+r8] + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((80-128))+rcx] + bswap r9d + DB 102,15,56,220,208 + DB 102,15,56,220,216 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + mov DWORD[((48+12))+rsp],r9d + lea r9,[4+r8] + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((96-128))+rcx] + bswap r9d + DB 102,15,56,220,209 + DB 102,15,56,220,217 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + mov DWORD[((64+12))+rsp],r9d + lea r9,[5+r8] + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((112-128))+rcx] + bswap r9d + DB 102,15,56,220,208 + DB 102,15,56,220,216 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + mov DWORD[((80+12))+rsp],r9d + lea r9,[6+r8] + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((128-128))+rcx] + bswap r9d + DB 102,15,56,220,209 + DB 
102,15,56,220,217 + xor r9d,ebp + DB 0x66,0x90 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + mov DWORD[((96+12))+rsp],r9d + lea r9,[7+r8] + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((144-128))+rcx] + bswap r9d + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + xor r9d,ebp + movdqu xmm10,XMMWORD[rdi] + DB 102,15,56,220,232 + mov DWORD[((112+12))+rsp],r9d + cmp eax,11 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((160-128))+rcx] + + jb NEAR $L$ctr32_enc_done + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((176-128))+rcx] + + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((192-128))+rcx] + + + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((208-128))+rcx] + + DB 102,15,56,220,208 + DB 102,15,56,220,216 + DB 102,15,56,220,224 + DB 102,15,56,220,232 + DB 102,15,56,220,240 + DB 102,15,56,220,248 + DB 102,68,15,56,220,192 + DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((224-128))+rcx] + jmp NEAR $L$ctr32_enc_done + +ALIGN 16 +$L$ctr32_enc_done: + movdqu xmm11,XMMWORD[16+rdi] + pxor xmm10,xmm0 + movdqu xmm12,XMMWORD[32+rdi] + pxor xmm11,xmm0 + movdqu xmm13,XMMWORD[48+rdi] + pxor xmm12,xmm0 + movdqu xmm14,XMMWORD[64+rdi] + pxor xmm13,xmm0 + movdqu xmm15,XMMWORD[80+rdi] + pxor xmm14,xmm0 + prefetcht0 [448+rdi] + prefetcht0 [512+rdi] + pxor xmm15,xmm0 + DB 102,15,56,220,209 + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + DB 102,15,56,220,241 + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + DB 102,68,15,56,220,201 + movdqu xmm1,XMMWORD[96+rdi] + lea rdi,[128+rdi] + + DB 102,65,15,56,221,210 + pxor xmm1,xmm0 + movdqu xmm10,XMMWORD[((112-128))+rdi] + DB 102,65,15,56,221,219 + pxor xmm10,xmm0 + movdqa xmm11,XMMWORD[rsp] + DB 102,65,15,56,221,228 + DB 102,65,15,56,221,237 + movdqa xmm12,XMMWORD[16+rsp] + movdqa xmm13,XMMWORD[32+rsp] + DB 102,65,15,56,221,246 + DB 102,65,15,56,221,255 + movdqa xmm14,XMMWORD[48+rsp] + movdqa xmm15,XMMWORD[64+rsp] + DB 102,68,15,56,221,193 + movdqa xmm0,XMMWORD[80+rsp] + movups xmm1,XMMWORD[((16-128))+rcx] + DB 102,69,15,56,221,202 + + movups XMMWORD[rsi],xmm2 + movdqa xmm2,xmm11 + movups XMMWORD[16+rsi],xmm3 + movdqa xmm3,xmm12 + movups XMMWORD[32+rsi],xmm4 + movdqa xmm4,xmm13 + movups XMMWORD[48+rsi],xmm5 + movdqa xmm5,xmm14 + movups XMMWORD[64+rsi],xmm6 + movdqa xmm6,xmm15 + movups XMMWORD[80+rsi],xmm7 + movdqa xmm7,xmm0 + movups XMMWORD[96+rsi],xmm8 + movups XMMWORD[112+rsi],xmm9 + lea rsi,[128+rsi] + + sub rdx,8 + jnc NEAR $L$ctr32_loop8 + + add rdx,8 + jz NEAR $L$ctr32_done + lea rcx,[((-128))+rcx] + +$L$ctr32_tail: + + + lea rcx,[16+rcx] + cmp rdx,4 + jb NEAR $L$ctr32_loop3 + je NEAR $L$ctr32_loop4 + + + shl eax,4 + movdqa xmm8,XMMWORD[96+rsp] + pxor xmm9,xmm9 + + movups xmm0,XMMWORD[16+rcx] + DB 102,15,56,220,209 + DB 102,15,56,220,217 + lea rcx,[((32-16))+rax*1+rcx] + neg rax + DB 102,15,56,220,225 + add rax,16 + movups xmm10,XMMWORD[rdi] + DB 102,15,56,220,233 + DB 102,15,56,220,241 
+ movups xmm11,XMMWORD[16+rdi] + movups xmm12,XMMWORD[32+rdi] + DB 102,15,56,220,249 + DB 102,68,15,56,220,193 + + call $L$enc_loop8_enter + + movdqu xmm13,XMMWORD[48+rdi] + pxor xmm2,xmm10 + movdqu xmm10,XMMWORD[64+rdi] + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm5,xmm13 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm6,xmm10 + movdqu XMMWORD[48+rsi],xmm5 + movdqu XMMWORD[64+rsi],xmm6 + cmp rdx,6 + jb NEAR $L$ctr32_done + + movups xmm11,XMMWORD[80+rdi] + xorps xmm7,xmm11 + movups XMMWORD[80+rsi],xmm7 + je NEAR $L$ctr32_done + + movups xmm12,XMMWORD[96+rdi] + xorps xmm8,xmm12 + movups XMMWORD[96+rsi],xmm8 + jmp NEAR $L$ctr32_done + +ALIGN 32 +$L$ctr32_loop4: + DB 102,15,56,220,209 + lea rcx,[16+rcx] + dec eax + DB 102,15,56,220,217 + DB 102,15,56,220,225 + DB 102,15,56,220,233 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop4 + DB 102,15,56,221,209 + DB 102,15,56,221,217 + movups xmm10,XMMWORD[rdi] + movups xmm11,XMMWORD[16+rdi] + DB 102,15,56,221,225 + DB 102,15,56,221,233 + movups xmm12,XMMWORD[32+rdi] + movups xmm13,XMMWORD[48+rdi] + + xorps xmm2,xmm10 + movups XMMWORD[rsi],xmm2 + xorps xmm3,xmm11 + movups XMMWORD[16+rsi],xmm3 + pxor xmm4,xmm12 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm5,xmm13 + movdqu XMMWORD[48+rsi],xmm5 + jmp NEAR $L$ctr32_done + +ALIGN 32 +$L$ctr32_loop3: + DB 102,15,56,220,209 + lea rcx,[16+rcx] + dec eax + DB 102,15,56,220,217 + DB 102,15,56,220,225 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop3 + DB 102,15,56,221,209 + DB 102,15,56,221,217 + DB 102,15,56,221,225 + + movups xmm10,XMMWORD[rdi] + xorps xmm2,xmm10 + movups XMMWORD[rsi],xmm2 + cmp rdx,2 + jb NEAR $L$ctr32_done + + movups xmm11,XMMWORD[16+rdi] + xorps xmm3,xmm11 + movups XMMWORD[16+rsi],xmm3 + je NEAR $L$ctr32_done + + movups xmm12,XMMWORD[32+rdi] + xorps xmm4,xmm12 + movups XMMWORD[32+rsi],xmm4 + +$L$ctr32_done: + xorps xmm0,xmm0 + xor ebp,ebp + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 + movaps XMMWORD[rsp],xmm0 + movaps XMMWORD[16+rsp],xmm0 + movaps XMMWORD[32+rsp],xmm0 + movaps XMMWORD[48+rsp],xmm0 + movaps XMMWORD[64+rsp],xmm0 + movaps XMMWORD[80+rsp],xmm0 + movaps XMMWORD[96+rsp],xmm0 + movaps XMMWORD[112+rsp],xmm0 + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + +$L$ctr32_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes_hw_ctr32_encrypt_blocks: +global aes_hw_set_encrypt_key_base + +ALIGN 16 +aes_hw_set_encrypt_key_base: + +$L$SEH_begin_aes_hw_set_encrypt_key_base_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[((BORINGSSL_function_hit+3))],1 +%endif + sub rsp,8 + +$L$SEH_prologue_aes_hw_set_encrypt_key_base_2: +$L$SEH_endprologue_aes_hw_set_encrypt_key_base_3: + movups xmm0,XMMWORD[rcx] + xorps xmm4,xmm4 + lea rax,[16+r8] + cmp edx,256 + je NEAR $L$14rounds + 
+ + cmp edx,128 + jne NEAR $L$bad_keybits + +$L$10rounds: + mov edx,9 + + movups XMMWORD[r8],xmm0 + DB 102,15,58,223,200,1 + call $L$key_expansion_128_cold + DB 102,15,58,223,200,2 + call $L$key_expansion_128 + DB 102,15,58,223,200,4 + call $L$key_expansion_128 + DB 102,15,58,223,200,8 + call $L$key_expansion_128 + DB 102,15,58,223,200,16 + call $L$key_expansion_128 + DB 102,15,58,223,200,32 + call $L$key_expansion_128 + DB 102,15,58,223,200,64 + call $L$key_expansion_128 + DB 102,15,58,223,200,128 + call $L$key_expansion_128 + DB 102,15,58,223,200,27 + call $L$key_expansion_128 + DB 102,15,58,223,200,54 + call $L$key_expansion_128 + movups XMMWORD[rax],xmm0 + mov DWORD[80+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret + + + +ALIGN 16 +$L$14rounds: + movups xmm2,XMMWORD[16+rcx] + mov edx,13 + lea rax,[16+rax] + + movups XMMWORD[r8],xmm0 + movups XMMWORD[16+r8],xmm2 + DB 102,15,58,223,202,1 + call $L$key_expansion_256a_cold + DB 102,15,58,223,200,1 + call $L$key_expansion_256b + DB 102,15,58,223,202,2 + call $L$key_expansion_256a + DB 102,15,58,223,200,2 + call $L$key_expansion_256b + DB 102,15,58,223,202,4 + call $L$key_expansion_256a + DB 102,15,58,223,200,4 + call $L$key_expansion_256b + DB 102,15,58,223,202,8 + call $L$key_expansion_256a + DB 102,15,58,223,200,8 + call $L$key_expansion_256b + DB 102,15,58,223,202,16 + call $L$key_expansion_256a + DB 102,15,58,223,200,16 + call $L$key_expansion_256b + DB 102,15,58,223,202,32 + call $L$key_expansion_256a + DB 102,15,58,223,200,32 + call $L$key_expansion_256b + DB 102,15,58,223,202,64 + call $L$key_expansion_256a + movups XMMWORD[rax],xmm0 + mov DWORD[16+rax],edx + xor rax,rax + jmp NEAR $L$enc_key_ret + +ALIGN 16 +$L$bad_keybits: + mov rax,-2 +$L$enc_key_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + add rsp,8 + + ret + +$L$SEH_end_aes_hw_set_encrypt_key_base_4: + +ALIGN 16 +$L$key_expansion_128: + + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] +$L$key_expansion_128_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret + + +ALIGN 16 +$L$key_expansion_256a: + + movups XMMWORD[rax],xmm2 + lea rax,[16+rax] +$L$key_expansion_256a_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret + + +ALIGN 16 +$L$key_expansion_256b: + + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] + + shufps xmm4,xmm2,16 + xorps xmm2,xmm4 + shufps xmm4,xmm2,140 + xorps xmm2,xmm4 + shufps xmm1,xmm1,170 + xorps xmm2,xmm1 + ret + + + +global aes_hw_set_encrypt_key_alt + +ALIGN 16 +aes_hw_set_encrypt_key_alt: + +$L$SEH_begin_aes_hw_set_encrypt_key_alt_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[((BORINGSSL_function_hit+3))],1 +%endif + sub rsp,8 + +$L$SEH_prologue_aes_hw_set_encrypt_key_alt_2: +$L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3: + movups xmm0,XMMWORD[rcx] + xorps xmm4,xmm4 + lea rax,[16+r8] + cmp edx,256 + je NEAR $L$14rounds_alt + + cmp edx,128 + jne NEAR $L$bad_keybits_alt + + mov edx,9 + movdqa xmm5,XMMWORD[$L$key_rotate] + mov r10d,8 + movdqa xmm4,XMMWORD[$L$key_rcon1] + movdqa xmm2,xmm0 + movdqu XMMWORD[r8],xmm0 + jmp NEAR $L$oop_key128 + +ALIGN 16 +$L$oop_key128: +DB 102,15,56,0,197 + DB 102,15,56,221,196 + pslld xmm4,1 + lea rax,[16+rax] + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[(-16)+rax],xmm0 + movdqa 
xmm2,xmm0 + + dec r10d + jnz NEAR $L$oop_key128 + + movdqa xmm4,XMMWORD[$L$key_rcon1b] + +DB 102,15,56,0,197 + DB 102,15,56,221,196 + pslld xmm4,1 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[rax],xmm0 + + movdqa xmm2,xmm0 +DB 102,15,56,0,197 + DB 102,15,56,221,196 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[16+rax],xmm0 + + mov DWORD[96+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret_alt + + + +ALIGN 16 +$L$14rounds_alt: + movups xmm2,XMMWORD[16+rcx] + mov edx,13 + lea rax,[16+rax] + movdqa xmm5,XMMWORD[$L$key_rotate] + movdqa xmm4,XMMWORD[$L$key_rcon1] + mov r10d,7 + movdqu XMMWORD[r8],xmm0 + movdqa xmm1,xmm2 + movdqu XMMWORD[16+r8],xmm2 + jmp NEAR $L$oop_key256 + +ALIGN 16 +$L$oop_key256: +DB 102,15,56,0,213 + DB 102,15,56,221,212 + + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + + pxor xmm0,xmm2 + movdqu XMMWORD[rax],xmm0 + + dec r10d + jz NEAR $L$done_key256 + + pshufd xmm2,xmm0,0xff + pxor xmm3,xmm3 + DB 102,15,56,221,211 + + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + + pxor xmm2,xmm1 + movdqu XMMWORD[16+rax],xmm2 + lea rax,[32+rax] + movdqa xmm1,xmm2 + + jmp NEAR $L$oop_key256 + +$L$done_key256: + mov DWORD[16+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret_alt + +ALIGN 16 +$L$bad_keybits_alt: + mov rax,-2 +$L$enc_key_ret_alt: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + add rsp,8 + + ret + +$L$SEH_end_aes_hw_set_encrypt_key_alt_4: + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$increment32: + DD 6,6,6,0 +$L$increment64: + DD 1,0,0,0 +$L$increment1: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$key_rotate: + DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +$L$key_rotate192: + DD 0x04070605,0x04070605,0x04070605,0x04070605 +$L$key_rcon1: + DD 1,1,1,1 +$L$key_rcon1b: + DD 0x1b,0x1b,0x1b,0x1b + + DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 + DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 + DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 + DB 115,108,46,111,114,103,62,0 +ALIGN 64 +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +ctr_xts_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[208+r8] + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp + + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov 
QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_ctr32 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ctr32: + DB 9,0,0,0 + DD ctr_xts_se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase +section .pdata +ALIGN 4 + DD $L$SEH_begin_aes_hw_set_encrypt_key_base_1 wrt ..imagebase + DD $L$SEH_end_aes_hw_set_encrypt_key_base_4 wrt ..imagebase + DD $L$SEH_info_aes_hw_set_encrypt_key_base_0 wrt ..imagebase + + DD $L$SEH_begin_aes_hw_set_encrypt_key_alt_1 wrt ..imagebase + DD $L$SEH_end_aes_hw_set_encrypt_key_alt_4 wrt ..imagebase + DD $L$SEH_info_aes_hw_set_encrypt_key_alt_0 wrt ..imagebase + + +section .xdata +ALIGN 4 +$L$SEH_info_aes_hw_set_encrypt_key_base_0: + DB 1 + DB $L$SEH_endprologue_aes_hw_set_encrypt_key_base_3-$L$SEH_begin_aes_hw_set_encrypt_key_base_1 + DB 1 + DB 0 + DB $L$SEH_prologue_aes_hw_set_encrypt_key_base_2-$L$SEH_begin_aes_hw_set_encrypt_key_base_1 + DB 2 + + DW 0 +$L$SEH_info_aes_hw_set_encrypt_key_alt_0: + DB 1 + DB $L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 + DB 1 + DB 0 + DB $L$SEH_prologue_aes_hw_set_encrypt_key_alt_2-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 + DB 2 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/aesni-x86_64-nasm.o b/ring-0.17.14/pregenerated/aesni-x86_64-nasm.o new file mode 100644 index 0000000000..f0a1501b54 Binary files /dev/null and b/ring-0.17.14/pregenerated/aesni-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/aesv8-armx-ios64.S b/ring-0.17.14/pregenerated/aesv8-armx-ios64.S new file mode 100644 index 0000000000..75c626a017 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-armx-ios64.S @@ -0,0 +1,353 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__>=7 +.text + +.section __TEXT,__const +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key + +.align 5 +_aes_hw_set_encrypt_key: +Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon@PAGE + add x3,x3,Lrcon@PAGEOFF + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + // 192-bit key support was removed. 
+ b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +// 192-bit key support was removed. + +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks + +.align 5 +_aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. + eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr 
x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/aesv8-armx-linux64.S b/ring-0.17.14/pregenerated/aesv8-armx-linux64.S new file mode 100644 index 0000000000..5ed5a72347 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-armx-linux64.S @@ -0,0 +1,353 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.section .rodata +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,%function +.align 5 +aes_hw_set_encrypt_key: +.Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt .Lenc_key_abort + cmp w1,#256 + b.gt .Lenc_key_abort + tst w1,#0x3f + b.ne .Lenc_key_abort + + adrp x3,.Lrcon + add x3,x3,:lo12:.Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt .Loop128 + // 192-bit key support was removed. + b .L256 + +.align 4 +.Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne .Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b .Ldone + +// 192-bit key support was removed. 
+ +.align 4 +.L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +.Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq .Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b .Loop256 + +.Ldone: + str w12,[x2] + mov x3,#0 + +.Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,%function +.align 5 +aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls .Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. + eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs .Loop3x_ctr32 + + adds x2,x2,#3 + b.eq .Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +.Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq .Lctr32_done + st1 {v3.16b},[x1] + 
+.Lctr32_done: + ldr x29,[sp],#16 + ret +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/aesv8-armx-win64.S b/ring-0.17.14/pregenerated/aesv8-armx-win64.S new file mode 100644 index 0000000000..70cc6420a6 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-armx-win64.S @@ -0,0 +1,357 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.section .rodata +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key + +.def aes_hw_set_encrypt_key + .type 32 +.endef +.align 5 +aes_hw_set_encrypt_key: +Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon + add x3,x3,:lo12:Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + // 192-bit key support was removed. + b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +// 192-bit key support was removed. 
+ +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret + +.globl aes_hw_ctr32_encrypt_blocks + +.def aes_hw_ctr32_encrypt_blocks + .type 32 +.endef +.align 5 +aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. + eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr 
x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/aesv8-gcm-armv8-ios64.S b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-ios64.S new file mode 100644 index 0000000000..1450b89446 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-ios64.S @@ -0,0 +1,1554 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__ >= 8 + + +.text +.globl _aes_gcm_enc_kernel +.private_extern _aes_gcm_enc_kernel + +.align 4 +_aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr q25, [x8, #112] // load rk7 + add x5, x5, x0 + lsr x12, x11, #32 + fmov d2, x10 // CTR block 2 + orr w11, w11, w11 + rev w12, w12 // rev_ctr32 + fmov d1, x10 // CTR block 1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + add w12, w12, #1 // increment rev_ctr32 + rev w9, w12 // CTR block 1 + fmov d3, x10 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 1 + add w12, w12, #1 // CTR block 1 + ldr q19, [x8, #16] // load rk1 + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + ldr q20, [x8, #32] // load rk2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + orr x9, x11, x9, lsl #32 // CTR block 3 + fmov v3.d[1], x9 // CTR block 3 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q21, [x8, #48] // load rk3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q24, [x8, #96] // load rk6 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q23, [x8, #80] // load rk5 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q22, [x8, #64] // load rk4 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ldr q29, 
[x8, #176] // load rk11 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ldr q26, [x8, #128] // load rk8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + add w12, w12, #1 // CTR block 3 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + ldr q27, [x8, #144] // load rk9 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + ldr q28, [x8, #160] // load rk10 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + b.lt Lenc_finish_first_blocks // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, 
v17.16b, v9.16b // h4k | h3k + aese v2.16b, v31.16b // AES block 2 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Lenc_tail // handle tail + + ldp x19, x20, [x0, #16] // AES block 1 - load plaintext + rev w9, w12 // CTR block 4 + ldp x6, x7, [x0, #0] // AES block 0 - load plaintext + ldp x23, x24, [x0, #48] // AES block 3 - load plaintext + ldp x21, x22, [x0, #32] // AES block 2 - load plaintext + add x0, x0, #64 // AES input_ptr update + eor x19, x19, x13 // AES block 1 - round N low + eor x20, x20, x14 // AES block 1 - round N high + fmov d5, x19 // AES block 1 - mov low + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + eor x24, x24, x14 // AES block 3 - round N high + fmov d4, x6 // AES block 0 - mov low + cmp x0, x5 // check if we have <= 8 blocks + fmov v4.d[1], x7 // AES block 0 - mov high + eor x23, x23, x13 // AES block 3 - round N low + eor x21, x21, x13 // AES block 2 - round N low + fmov v5.d[1], x20 // AES block 1 - mov high + fmov d6, x21 // AES block 2 - mov low + add w12, w12, #1 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov d7, x23 // AES block 3 - mov low + eor x22, x22, x14 // AES block 2 - round N high + fmov v6.d[1], x22 // AES block 2 - mov high + eor v4.16b, v4.16b, v0.16b // AES block 0 - result + fmov d0, x10 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + eor v5.16b, v5.16b, v1.16b // AES block 1 - result + fmov d1, x10 // CTR block 5 + orr x9, x11, x9, lsl #32 // CTR block 5 + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + st1 { v4.16b}, [x2], #16 // AES block 0 - store result + fmov v7.d[1], x24 // AES block 3 - mov high + orr x9, x11, x9, lsl #32 // CTR block 6 + eor v6.16b, v6.16b, v2.16b // AES block 2 - result + st1 { v5.16b}, [x2], #16 // AES block 1 - store result + add w12, w12, #1 // CTR block 6 + fmov d2, x10 // CTR block 6 + fmov v2.d[1], x9 // CTR block 6 + st1 { v6.16b}, [x2], #16 // AES block 2 - store result + rev w9, w12 // CTR block 7 + orr x9, x11, x9, lsl #32 // CTR block 7 + eor v7.16b, v7.16b, v3.16b // AES block 3 - result + st1 { v7.16b}, [x2], #16 // AES block 3 - store result + b.ge Lenc_prepretail // do prepretail + +Lenc_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + fmov v3.d[1], x9 // CTR block 4k+3 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x23, x23, x13 // AES block 4k+7 - round N low + aese 
v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + eor x22, x22, x14 // AES block 4k+6 - round N high + mov d8, v4.d[1] // GHASH block 4k - mid + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor x19, x19, x13 // AES block 4k+5 - round N low + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 
4k+6 - round 8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + eor x21, x21, x13 // AES block 4k+6 - round N low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + movi v8.8b, #0xc2 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + fmov d5, x19 // AES block 4k+5 - mov low + ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext + b.lt Lenc_main_loop_continue // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Lenc_main_loop_continue: + shl d8, d8, #56 // mod_constant + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + add w12, w12, #1 // CTR block 4k+3 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + add x0, x0, #64 // AES input_ptr update + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + rev w9, w12 // CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor x6, x6, x13 // AES block 4k+4 - round N low + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + eor x7, x7, x14 // AES block 4k+4 - round N high + fmov d4, x6 // AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid + eor x20, x20, x14 // AES block 4k+5 - round N high + eor x24, x24, x14 // AES block 4k+7 - round N high + add w12, w12, #1 // CTR block 4k+8 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + fmov d7, x23 // AES block 4k+7 - mov low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + fmov v5.d[1], x20 // AES block 4k+5 - mov high + fmov d6, x21 // AES block 4k+6 - mov low + cmp x0, x5 // LOOP CONTROL + fmov v6.d[1], x22 // AES block 4k+6 - mov high + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + rev w9, w12 // CTR block 4k+9 + add w12, w12, #1 // CTR block 4k+9 + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + fmov d1, x10 // CTR block 
4k+9 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + fmov v1.d[1], x9 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + rev w9, w12 // CTR block 4k+10 + st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + fmov v7.d[1], x24 // AES block 4k+7 - mov high + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + fmov d2, x10 // CTR block 4k+10 + st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result + fmov v2.d[1], x9 // CTR block 4k+10 + rev w9, w12 // CTR block 4k+11 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + orr x9, x11, x9, lsl #32 // CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result + b.lt Lenc_main_loop + +Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + fmov v3.d[1], x9 // CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - 
mid + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + add w12, w12, #1 // CTR block 4k+3 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + b.eq Lenc_finish_prepretail // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - 
round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + +Lenc_finish_prepretail: + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v11.16b, v11.16b, v4.16b + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b + +Lenc_tail: // TAIL + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 4k+4 - mov low + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result + b.gt Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + sub w12, w12, #1 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + b.gt Lenc_blocks_more_than_1 + sub w12, w12, #1 + b Lenc_blocks_less_than_1 +Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-3 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1: 
// blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ldp x6, x7, [x0], #16 // AES final block - load input low & high + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored + mvn x14, xzr // rkN_h = 0xffffffffffffffff + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0b is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + rev w9, w12 + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + str w9, [x16, #12] // store the updated counter + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _aes_gcm_dec_kernel 
+.private_extern _aes_gcm_dec_kernel + +.align 4 +_aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + 
aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, 
[x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // 
GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc 
v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, 
v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese 
v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES 
block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // 
GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/aesv8-gcm-armv8-linux64.S b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-linux64.S new file mode 100644 index 0000000000..4e87ce7592 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-linux64.S @@ -0,0 +1,1554 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__ >= 8 + +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel +.hidden aes_gcm_enc_kernel +.type aes_gcm_enc_kernel,%function +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr q25, [x8, #112] // load rk7 + add x5, x5, x0 + lsr x12, x11, #32 + fmov d2, x10 // CTR block 2 + orr w11, w11, w11 + rev w12, w12 // rev_ctr32 + fmov d1, x10 // CTR block 1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + add w12, w12, #1 // increment rev_ctr32 + rev w9, w12 // CTR block 1 + fmov d3, x10 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 1 + add w12, w12, #1 // CTR block 1 + ldr q19, [x8, #16] // load rk1 + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + ldr q20, [x8, #32] // load rk2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + orr x9, x11, x9, lsl #32 // CTR block 3 + fmov v3.d[1], x9 // CTR block 3 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q21, [x8, #48] // load rk3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q24, [x8, #96] // load rk6 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q23, [x8, #80] // load rk5 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q22, [x8, #64] // load rk4 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ldr q29, [x8, #176] // load rk11 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ldr q26, [x8, #128] // load rk8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + add w12, w12, #1 // CTR block 3 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 
+ rev64 v11.16b, v11.16b + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + ldr q27, [x8, #144] // load rk9 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + ldr q28, [x8, #160] // load rk10 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + b.lt .Lenc_finish_first_blocks // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq .Lenc_finish_first_blocks // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +.Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v2.16b, v31.16b // AES block 2 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge .Lenc_tail // handle tail + + ldp x19, x20, [x0, #16] // AES block 1 - load plaintext + rev w9, w12 // CTR block 4 
+ ldp x6, x7, [x0, #0] // AES block 0 - load plaintext + ldp x23, x24, [x0, #48] // AES block 3 - load plaintext + ldp x21, x22, [x0, #32] // AES block 2 - load plaintext + add x0, x0, #64 // AES input_ptr update + eor x19, x19, x13 // AES block 1 - round N low + eor x20, x20, x14 // AES block 1 - round N high + fmov d5, x19 // AES block 1 - mov low + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + eor x24, x24, x14 // AES block 3 - round N high + fmov d4, x6 // AES block 0 - mov low + cmp x0, x5 // check if we have <= 8 blocks + fmov v4.d[1], x7 // AES block 0 - mov high + eor x23, x23, x13 // AES block 3 - round N low + eor x21, x21, x13 // AES block 2 - round N low + fmov v5.d[1], x20 // AES block 1 - mov high + fmov d6, x21 // AES block 2 - mov low + add w12, w12, #1 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov d7, x23 // AES block 3 - mov low + eor x22, x22, x14 // AES block 2 - round N high + fmov v6.d[1], x22 // AES block 2 - mov high + eor v4.16b, v4.16b, v0.16b // AES block 0 - result + fmov d0, x10 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + eor v5.16b, v5.16b, v1.16b // AES block 1 - result + fmov d1, x10 // CTR block 5 + orr x9, x11, x9, lsl #32 // CTR block 5 + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + st1 { v4.16b}, [x2], #16 // AES block 0 - store result + fmov v7.d[1], x24 // AES block 3 - mov high + orr x9, x11, x9, lsl #32 // CTR block 6 + eor v6.16b, v6.16b, v2.16b // AES block 2 - result + st1 { v5.16b}, [x2], #16 // AES block 1 - store result + add w12, w12, #1 // CTR block 6 + fmov d2, x10 // CTR block 6 + fmov v2.d[1], x9 // CTR block 6 + st1 { v6.16b}, [x2], #16 // AES block 2 - store result + rev w9, w12 // CTR block 7 + orr x9, x11, x9, lsl #32 // CTR block 7 + eor v7.16b, v7.16b, v3.16b // AES block 3 - result + st1 { v7.16b}, [x2], #16 // AES block 3 - store result + b.ge .Lenc_prepretail // do prepretail + +.Lenc_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + fmov v3.d[1], x9 // CTR block 4k+3 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x23, x23, x13 // AES block 4k+7 - round N low + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + eor x22, x22, x14 // AES block 4k+6 - round N high + mov d8, v4.d[1] // GHASH block 4k - mid + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v0.16b, v22.16b + aesmc v0.16b, 
v0.16b // AES block 4k+4 - round 4 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor x19, x19, x13 // AES block 4k+5 - round N low + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + eor x21, x21, x13 // AES block 4k+6 - round N low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + movi v8.8b, #0xc2 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + cmp x17, #12 // setup flags 
for AES-128/192/256 check + fmov d5, x19 // AES block 4k+5 - mov low + ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext + b.lt .Lenc_main_loop_continue // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq .Lenc_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +.Lenc_main_loop_continue: + shl d8, d8, #56 // mod_constant + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + add w12, w12, #1 // CTR block 4k+3 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + add x0, x0, #64 // AES input_ptr update + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + rev w9, w12 // CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor x6, x6, x13 // AES block 4k+4 - round N low + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + eor x7, x7, x14 // AES block 4k+4 - round N high + fmov d4, x6 // AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid + eor x20, x20, x14 // AES block 4k+5 - round N high + eor x24, x24, x14 // AES block 4k+7 - round N high + add w12, w12, #1 // CTR block 4k+8 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + fmov d7, x23 // AES block 4k+7 - mov low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + fmov v5.d[1], x20 // AES block 4k+5 - mov high + fmov d6, x21 // AES block 4k+6 - mov low + cmp x0, x5 // .LOOP CONTROL + fmov v6.d[1], x22 // AES block 4k+6 - mov high + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + rev w9, w12 // CTR block 4k+9 + add w12, w12, #1 // CTR block 4k+9 + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + fmov d1, x10 // CTR block 4k+9 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + fmov v1.d[1], x9 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + rev w9, w12 // CTR block 4k+10 + st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + fmov v7.d[1], x24 // AES block 4k+7 - mov high + ext v10.16b, v10.16b, v10.16b, #8 // MODULO 
- other mid alignment + st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + fmov d2, x10 // CTR block 4k+10 + st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result + fmov v2.d[1], x9 // CTR block 4k+10 + rev w9, w12 // CTR block 4k+11 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + orr x9, x11, x9, lsl #32 // CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result + b.lt .Lenc_main_loop + +.Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + fmov v3.d[1], x9 // CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + add w12, w12, #1 // CTR block 4k+3 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + eor v11.16b, 
v11.16b, v5.16b // GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt .Lenc_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + b.eq .Lenc_finish_prepretail // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + +.Lenc_finish_prepretail: + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d 
+ ext v10.16b, v10.16b, v10.16b, #8 + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v11.16b, v11.16b, v4.16b + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b + +.Lenc_tail: // TAIL + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 4k+4 - mov low + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result + b.gt .Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + sub w12, w12, #1 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt .Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + b.gt .Lenc_blocks_more_than_1 + sub w12, w12, #1 + b .Lenc_blocks_less_than_1 +.Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-3 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +.Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +.Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ldp x6, x7, [x0], #16 // AES final block - load input low & high + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, 
v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +.Lenc_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored + mvn x14, xzr // rkN_h = 0xffffffffffffffff + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0b is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + rev w9, w12 + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + str w9, [x16, #12] // store the updated counter + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel +.globl aes_gcm_dec_kernel +.hidden aes_gcm_dec_kernel +.type aes_gcm_dec_kernel,%function +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 
2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt .Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq .Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +.Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge .Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov 
low + rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge .Ldec_prepretail // do prepretail + +.Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + 
aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt .Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese 
v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq .Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +.Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // .LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt .Ldec_main_loop + +.Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 
// CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese 
v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt .Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq .Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +.Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + 
aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +.Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt .Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt .Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .Ldec_blocks_more_than_1 + sub w12, w12, #1 + b .Ldec_blocks_less_than_1 +.Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +.Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +.Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - 
mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +.Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/aesv8-gcm-armv8-win64.S b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-win64.S new file mode 100644 index 0000000000..6f39544f29 --- /dev/null +++ b/ring-0.17.14/pregenerated/aesv8-gcm-armv8-win64.S @@ -0,0 +1,1558 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__ >= 8 + +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_kernel + +.def aes_gcm_enc_kernel + .type 32 +.endef +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! + mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ldr q25, [x8, #112] // load rk7 + add x5, x5, x0 + lsr x12, x11, #32 + fmov d2, x10 // CTR block 2 + orr w11, w11, w11 + rev w12, w12 // rev_ctr32 + fmov d1, x10 // CTR block 1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + add w12, w12, #1 // increment rev_ctr32 + rev w9, w12 // CTR block 1 + fmov d3, x10 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 1 + add w12, w12, #1 // CTR block 1 + ldr q19, [x8, #16] // load rk1 + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + ldr q20, [x8, #32] // load rk2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + orr x9, x11, x9, lsl #32 // CTR block 3 + fmov v3.d[1], x9 // CTR block 3 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q21, [x8, #48] // load rk3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q24, [x8, #96] // load rk6 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q23, [x8, #80] // load rk5 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q22, [x8, #64] // load rk4 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ldr q29, [x8, #176] // load rk11 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ldr q26, [x8, #128] // load rk8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + add w12, w12, #1 // CTR block 3 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, 
v11.16b + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + ldr q27, [x8, #144] // load rk9 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + ldr q28, [x8, #160] // load rk10 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + b.lt Lenc_finish_first_blocks // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v2.16b, v31.16b // AES block 2 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Lenc_tail // handle tail + + ldp x19, x20, [x0, #16] // AES block 1 - load plaintext + rev w9, w12 // CTR block 4 + ldp x6, x7, [x0, 
#0] // AES block 0 - load plaintext + ldp x23, x24, [x0, #48] // AES block 3 - load plaintext + ldp x21, x22, [x0, #32] // AES block 2 - load plaintext + add x0, x0, #64 // AES input_ptr update + eor x19, x19, x13 // AES block 1 - round N low + eor x20, x20, x14 // AES block 1 - round N high + fmov d5, x19 // AES block 1 - mov low + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + eor x24, x24, x14 // AES block 3 - round N high + fmov d4, x6 // AES block 0 - mov low + cmp x0, x5 // check if we have <= 8 blocks + fmov v4.d[1], x7 // AES block 0 - mov high + eor x23, x23, x13 // AES block 3 - round N low + eor x21, x21, x13 // AES block 2 - round N low + fmov v5.d[1], x20 // AES block 1 - mov high + fmov d6, x21 // AES block 2 - mov low + add w12, w12, #1 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov d7, x23 // AES block 3 - mov low + eor x22, x22, x14 // AES block 2 - round N high + fmov v6.d[1], x22 // AES block 2 - mov high + eor v4.16b, v4.16b, v0.16b // AES block 0 - result + fmov d0, x10 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + eor v5.16b, v5.16b, v1.16b // AES block 1 - result + fmov d1, x10 // CTR block 5 + orr x9, x11, x9, lsl #32 // CTR block 5 + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + st1 { v4.16b}, [x2], #16 // AES block 0 - store result + fmov v7.d[1], x24 // AES block 3 - mov high + orr x9, x11, x9, lsl #32 // CTR block 6 + eor v6.16b, v6.16b, v2.16b // AES block 2 - result + st1 { v5.16b}, [x2], #16 // AES block 1 - store result + add w12, w12, #1 // CTR block 6 + fmov d2, x10 // CTR block 6 + fmov v2.d[1], x9 // CTR block 6 + st1 { v6.16b}, [x2], #16 // AES block 2 - store result + rev w9, w12 // CTR block 7 + orr x9, x11, x9, lsl #32 // CTR block 7 + eor v7.16b, v7.16b, v3.16b // AES block 3 - result + st1 { v7.16b}, [x2], #16 // AES block 3 - store result + b.ge Lenc_prepretail // do prepretail + +Lenc_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + fmov v3.d[1], x9 // CTR block 4k+3 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x23, x23, x13 // AES block 4k+7 - round N low + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + eor x22, x22, x14 // AES block 4k+6 - round N high + mov d8, v4.d[1] // GHASH block 4k - mid + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 
4k+4 - round 4 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor x19, x19, x13 // AES block 4k+5 - round N low + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + eor x21, x21, x13 // AES block 4k+6 - round N low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + movi v8.8b, #0xc2 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 
check + fmov d5, x19 // AES block 4k+5 - mov low + ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext + b.lt Lenc_main_loop_continue // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Lenc_main_loop_continue: + shl d8, d8, #56 // mod_constant + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + add w12, w12, #1 // CTR block 4k+3 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + add x0, x0, #64 // AES input_ptr update + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + rev w9, w12 // CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor x6, x6, x13 // AES block 4k+4 - round N low + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + eor x7, x7, x14 // AES block 4k+4 - round N high + fmov d4, x6 // AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid + eor x20, x20, x14 // AES block 4k+5 - round N high + eor x24, x24, x14 // AES block 4k+7 - round N high + add w12, w12, #1 // CTR block 4k+8 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + fmov d7, x23 // AES block 4k+7 - mov low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + fmov v5.d[1], x20 // AES block 4k+5 - mov high + fmov d6, x21 // AES block 4k+6 - mov low + cmp x0, x5 // LOOP CONTROL + fmov v6.d[1], x22 // AES block 4k+6 - mov high + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + rev w9, w12 // CTR block 4k+9 + add w12, w12, #1 // CTR block 4k+9 + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + fmov d1, x10 // CTR block 4k+9 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + fmov v1.d[1], x9 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + rev w9, w12 // CTR block 4k+10 + st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + fmov v7.d[1], x24 // AES block 4k+7 - mov high + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + 
st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + fmov d2, x10 // CTR block 4k+10 + st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result + fmov v2.d[1], x9 // CTR block 4k+10 + rev w9, w12 // CTR block 4k+11 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + orr x9, x11, x9, lsl #32 // CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result + b.lt Lenc_main_loop + +Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free) + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov d3, x10 // CTR block 4k+3 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free) + fmov v3.d[1], x9 // CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free) + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free) + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + add w12, w12, #1 // CTR block 4k+3 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 
4k+2 - low + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + mov d4, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + b.eq Lenc_finish_prepretail // branch if AES-192 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + +Lenc_finish_prepretail: + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, 
#8 + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v11.16b, v11.16b, v4.16b + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b + +Lenc_tail: // TAIL + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 4k+4 - mov low + fmov v4.d[1], x7 // AES block 4k+4 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result + b.gt Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + sub w12, w12, #1 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + b.gt Lenc_blocks_more_than_1 + sub w12, w12, #1 + b Lenc_blocks_less_than_1 +Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-3 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ldp x6, x7, [x0], #16 // AES final block - load input low & high + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - 
high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored + mvn x14, xzr // rkN_h = 0xffffffffffffffff + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0b is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + rev w9, w12 + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + str w9, [x16, #12] // store the updated counter + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b // MODULO - fold into low + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_gcm_dec_kernel + +.def aes_gcm_dec_kernel + .type 32 +.endef +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-128]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldp x10, x11, [x16] // ctr96_b64, ctr96_t32 + ldr q26, [x8, #128] // load rk8 + sub x5, x5, #1 // byte_len - 1 + ldr q25, [x8, #112] // load rk7 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x4, x0, x1, lsr #3 // end_input_ptr + ldr q24, [x8, #96] // load rk6 + lsr x12, x11, #32 + ldr q23, [x8, #80] // load rk5 + orr w11, w11, w11 + ldr q21, [x8, #48] // load rk3 + add x5, x5, x0 + rev w12, w12 // rev_ctr32 + add w12, w12, #1 // increment rev_ctr32 + fmov d3, x10 // CTR block 3 + rev w9, w12 // CTR block 1 + add w12, w12, #1 // CTR block 1 + fmov d1, x10 // CTR block 1 + orr x9, x11, x9, lsl #32 // CTR block 1 + ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible + fmov v1.d[1], x9 // CTR block 1 + rev w9, w12 // CTR block 2 + add w12, w12, #1 // CTR block 2 + fmov d2, x10 // CTR block 2 + orr x9, x11, x9, lsl #32 // CTR block 2 + fmov v2.d[1], x9 // CTR block 2 + rev w9, w12 // CTR block 3 + orr x9, x11, x9, lsl #32 // CTR block 3 + ldr q18, [x8, #0] // load rk0 + fmov v3.d[1], x9 // CTR block 3 + add w12, w12, #1 // CTR block 3 + ldr q22, [x8, #64] // load rk4 + ldr q19, [x8, #16] // load rk1 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldr q14, [x6, #48] // load h3l | h3h + ext v14.16b, v14.16b, v14.16b, #8 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldr q15, [x6, #80] // load h4l | h4h + ext v15.16b, v15.16b, v15.16b, #8 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldr q13, [x6, #32] // load h2l | h2h + ext v13.16b, v13.16b, v13.16b, #8 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldr q20, [x8, #32] // load rk2 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + ldr q27, [x8, #144] // load rk9 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + ldr q30, [x8, #192] // load rk12 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ldr q12, [x6] // load h1l | h1h + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q28, [x8, #160] // load rk10 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 
2 - round 4 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + ldr q29, [x8, #176] // load rk11 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + b.lt Ldec_finish_first_blocks // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + +Ldec_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + eor v16.16b, v16.16b, v8.16b // h2k | h1k + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + b.ge Ldec_tail // handle tail + + ldr q4, [x0, #0] // AES block 0 - load ciphertext + ldr q5, [x0, #16] // AES block 1 - load ciphertext + rev w9, w12 // CTR block 4 + eor v0.16b, v4.16b, v0.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b // AES block 1 - result + rev64 v5.16b, v5.16b // GHASH block 1 + ldr q7, [x0, #48] // AES block 3 - load ciphertext + mov x7, v0.d[1] // AES block 0 - mov high + mov x6, v0.d[0] // AES block 0 - mov low 
+ rev64 v4.16b, v4.16b // GHASH block 0 + add w12, w12, #1 // CTR block 4 + fmov d0, x10 // CTR block 4 + orr x9, x11, x9, lsl #32 // CTR block 4 + fmov v0.d[1], x9 // CTR block 4 + rev w9, w12 // CTR block 5 + add w12, w12, #1 // CTR block 5 + mov x19, v1.d[0] // AES block 1 - mov low + orr x9, x11, x9, lsl #32 // CTR block 5 + mov x20, v1.d[1] // AES block 1 - mov high + eor x7, x7, x14 // AES block 0 - round N high + eor x6, x6, x13 // AES block 0 - round N low + stp x6, x7, [x2], #16 // AES block 0 - store result + fmov d1, x10 // CTR block 5 + ldr q6, [x0, #32] // AES block 2 - load ciphertext + add x0, x0, #64 // AES input_ptr update + fmov v1.d[1], x9 // CTR block 5 + rev w9, w12 // CTR block 6 + add w12, w12, #1 // CTR block 6 + eor x19, x19, x13 // AES block 1 - round N low + orr x9, x11, x9, lsl #32 // CTR block 6 + eor x20, x20, x14 // AES block 1 - round N high + stp x19, x20, [x2], #16 // AES block 1 - store result + eor v2.16b, v6.16b, v2.16b // AES block 2 - result + cmp x0, x5 // check if we have <= 8 blocks + b.ge Ldec_prepretail // do prepretail + +Ldec_main_loop: // main loop start + mov x21, v2.d[0] // AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev w9, w12 // CTR block 4k+7 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x23, v3.d[0] // AES block 4k+3 - mov low + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + fmov v3.d[1], x9 // CTR block 4k+7 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + eor x22, x22, x14 // AES block 4k+2 - round N high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + eor x21, x21, x13 // AES block 4k+2 - round N low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor x23, x23, x13 // AES block 4k+3 - round N low + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor x24, x24, x14 // AES block 4k+3 - round N high + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + aese v3.16b, v19.16b + aesmc 
v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + add w12, w12, #1 // CTR block 4k+7 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + rev w9, w12 // CTR block 4k+8 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + add w12, w12, #1 // CTR block 4k+8 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 // CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + b.lt Ldec_main_loop_continue // branch if AES-128 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, 
v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_main_loop_continue: + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext + eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext + ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext + mov x7, v0.d[1] // AES block 4k+4 - mov high + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + add x0, x0, #64 // AES input_ptr update + mov x6, v0.d[0] // AES block 4k+4 - mov low + fmov d0, x10 // CTR block 4k+8 + fmov v0.d[1], x9 // CTR block 4k+8 + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result + rev w9, w12 // CTR block 4k+9 + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+9 + cmp x0, x5 // LOOP CONTROL + add w12, w12, #1 // CTR block 4k+9 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + mov x20, v1.d[1] // AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + mov x19, v1.d[0] // AES block 4k+5 - mov low + fmov d1, x10 // CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + fmov v1.d[1], x9 // CTR block 4k+9 + rev w9, w12 // CTR block 4k+10 + add w12, w12, #1 // CTR block 4k+10 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + orr x9, x11, x9, lsl #32 // CTR block 4k+10 + rev64 v5.16b, v5.16b // GHASH block 4k+5 + eor x20, x20, x14 // AES block 4k+5 - round N high + stp x6, x7, [x2], #16 // AES block 4k+4 - store result + eor x19, x19, x13 // AES block 4k+5 - round N low + stp x19, x20, [x2], #16 // AES block 4k+5 - store result + rev64 v4.16b, v4.16b // GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + b.lt Ldec_main_loop + +Ldec_prepretail: // PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + mov x21, v2.d[0] // AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + mov x22, v2.d[1] // AES block 4k+2 - mov high + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + fmov d2, x10 // CTR block 
4k+6 + fmov v2.d[1], x9 // CTR block 4k+6 + rev w9, w12 // CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + orr x9, x11, x9, lsl #32 // CTR block 4k+7 + mov x23, v3.d[0] // AES block 4k+3 - mov low + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + mov x24, v3.d[1] // AES block 4k+3 - mov high + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + mov d8, v4.d[1] // GHASH block 4k - mid + fmov d3, x10 // CTR block 4k+7 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + fmov v3.d[1], x9 // CTR block 4k+7 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid + aese v2.16b, 
v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + cmp x17, #12 // setup flags for AES-128/192/256 check + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + b.lt Ldec_finish_prepretail // branch if AES-128 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 + +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor x22, x22, x14 // AES block 4k+2 - round N high + eor x23, x23, x13 // AES block 4k+3 - round N low + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + add w12, w12, #1 // CTR block 4k+7 + eor x21, x21, x13 // AES block 4k+2 - round N low + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + eor x24, x24, x14 // AES block 4k+3 - round N high + stp x21, x22, [x2], #16 // AES block 4k+2 - store result + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + stp x23, x24, [x2], #16 // AES block 4k+3 - store result + + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + aese 
v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result + mov x6, v0.d[0] // AES block 4k+4 - mov low + mov x7, v0.d[1] // AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + cmp x5, #48 + eor x6, x6, x13 // AES block 4k+4 - round N low + eor x7, x7, x14 // AES block 4k+4 - round N high + b.gt Ldec_blocks_more_than_3 + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + cmp x5, #32 + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt Ldec_blocks_more_than_2 + sub w12, w12, #1 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + sub w12, w12, #1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins 
v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + rev w9, w12 + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + movi v8.8b, #0xc2 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + shl d8, d8, #56 // mod_constant + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + str w9, [x16, #12] // store the updated counter + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #128 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/armv4-mont-linux32.S b/ring-0.17.14/pregenerated/armv4-mont-linux32.S new file mode 100644 index 0000000000..e1c005501e --- /dev/null +++ b/ring-0.17.14/pregenerated/armv4-mont-linux32.S @@ -0,0 +1,937 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. 
This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl bn_mul_mont_nohw +.hidden bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function + +.align 5 +bn_mul_mont_nohw: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block + cmp ip,#2 + mov r0,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers + + mov r0,r0,lsl#2 @ rescale r0 for byte count + sub sp,sp,r0 @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub r0,r0,#4 @ "num=num-1" + add r4,r2,r0 @ &bp[num-1] + + add r0,sp,r0 @ r0 to point at &tp[num-1] + ldr r8,[r0,#14*4] @ &n0 + ldr r2,[r2] @ bp[0] + ldr r5,[r1],#4 @ ap[0],ap++ + ldr r6,[r3],#4 @ np[0],np++ + ldr r8,[r8] @ *n0 + str r4,[r0,#15*4] @ save &bp[num] + + umull r10,r11,r5,r2 @ ap[0]*bp[0] + str r8,[r0,#14*4] @ save n0 value + mul r8,r10,r8 @ "tp[0]"*n0 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" + mov r4,sp + +.L1st: + ldr r5,[r1],#4 @ ap[j],ap++ + mov r10,r11 + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[0] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .L1st + + adds r12,r12,r11 + ldr r4,[r0,#13*4] @ restore bp + mov r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + mov r7,sp + str r14,[r0,#4] @ tp[num]= + +.Louter: + sub r7,r0,r7 @ "original" r0-1 value + sub r1,r1,r7 @ "rewind" ap to &ap[1] + ldr r2,[r4,#4]! @ *(++bp) + sub r3,r3,r7 @ "rewind" np to &np[1] + ldr r5,[r1,#-4] @ ap[0] + ldr r10,[sp] @ tp[0] + ldr r6,[r3,#-4] @ np[0] + ldr r7,[sp,#4] @ tp[1] + + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] + str r4,[r0,#13*4] @ save bp + mul r8,r10,r8 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" + mov r4,sp + +.Linner: + ldr r5,[r1],#4 @ ap[j],ap++ + adds r10,r11,r7 @ +=tp[j] + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[i] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adc r11,r11,#0 + ldr r7,[r4,#8] @ tp[j+1] + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .Linner + + adds r12,r12,r11 + mov r14,#0 + ldr r4,[r0,#13*4] @ restore bp + adc r14,r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adds r12,r12,r7 + ldr r7,[r0,#15*4] @ restore &bp[num] + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + + cmp r4,r7 +#ifdef __thumb2__ + itt ne +#endif + movne r7,sp + bne .Louter + + ldr r2,[r0,#12*4] @ pull rp + mov r5,sp + add r0,r0,#4 @ r0 to point at &tp[num] + sub r5,r0,r5 @ "original" num value + mov r4,sp @ "rewind" r4 + mov r1,r4 @ "borrow" r1 + sub r3,r3,r5 @ "rewind" r3 to &np[0] + + subs r7,r7,r7 @ "clear" carry flag +.Lsub: ldr r7,[r4],#4 + ldr r6,[r3],#4 + sbcs r7,r7,r6 @ tp[j]-np[j] + str r7,[r2],#4 @ rp[j]= + teq r4,r0 @ preserve carry + bne .Lsub + sbcs r14,r14,#0 @ upmost carry + mov r4,sp @ "rewind" r4 + sub r2,r2,r5 @ "rewind" r2 + +.Lcopy: ldr r7,[r4] @ conditional copy + ldr r5,[r2] + str sp,[r4],#4 @ zap tp +#ifdef __thumb2__ + it cc +#endif + movcc r5,r7 + str r5,[r2],#4 + teq r4,r0 @ preserve carry + bne .Lcopy + + mov sp,r0 + add sp,sp,#4 @ skip over tp[num+1] + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: +#if __ARM_ARCH>=5 + bx lr @ bx lr +#else + 
tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl bn_mul8x_mont_neon +.hidden bn_mul8x_mont_neon +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load rest of parameter block + mov ip,sp + + cmp r5,#8 + bhi .LNEON_8n + + @ special case for r5==8, everything is in register bank... + + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + sub r7,sp,r5,lsl#4 + vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( + and r7,r7,#-64 + vld1.32 {d30[0]}, [r4,:32] + mov sp,r7 @ alloca + vzip.16 d28,d8 + + vmull.u32 q6,d28,d0[0] + vmull.u32 q7,d28,d0[1] + vmull.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmull.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + vmul.u32 d29,d29,d30 + + vmull.u32 q10,d28,d2[0] + vld1.32 {d4,d5,d6,d7}, [r3]! + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmull.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + sub r9,r5,#1 + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + vzip.16 d28,d8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + subs r9,r9,#1 + vmul.u32 d29,d29,d30 + + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + bne .LNEON_outer8 + + vadd.u64 d12,d12,d10 + mov r7,sp + vshr.u64 d10,d12,#16 + mov r8,r5 + vadd.u64 d13,d13,d10 + add r6,sp,#96 + vshr.u64 d10,d13,#16 + vzip.16 d12,d13 + + b .LNEON_tail_entry + +.align 4 +.LNEON_8n: + veor q6,q6,q6 + sub r7,sp,#128 + veor q7,q7,q7 + sub r7,r7,r5,lsl#4 + veor q8,q8,q8 + and r7,r7,#-64 + veor q9,q9,q9 + mov sp,r7 @ alloca + veor q10,q10,q10 + add r7,r7,#256 + veor q11,q11,q11 + sub r8,r5,#8 + veor q12,q12,q12 + veor q13,q13,q13 + +.LNEON_8n_init: + vst1.64 {q6,q7},[r7,:256]! + subs r8,r8,#8 + vst1.64 {q8,q9},[r7,:256]! + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12,q13},[r7,:256]! + bne .LNEON_8n_init + + add r6,sp,#256 + vld1.32 {d0,d1,d2,d3},[r1]! + add r10,sp,#8 + vld1.32 {d30[0]},[r4,:32] + mov r9,r5 + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + vld1.32 {d28[0]},[r2,:32]! @ *b++ + veor d8,d8,d8 + vzip.16 d28,d8 + add r7,sp,#128 + vld1.32 {d4,d5,d6,d7},[r3]! 
+ + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + vadd.u64 d29,d29,d12 + vmlal.u32 q10,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q11,d28,d2[1] + vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q6,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q7,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q8,d29,d5[0] + vshr.u64 d12,d12,#16 + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vadd.u64 d12,d12,d13 + vmlal.u32 q11,d29,d6[1] + vshr.u64 d12,d12,#16 + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vadd.u64 d14,d14,d12 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128]! + vmlal.u32 q8,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q9,d28,d1[0] + vshl.i64 d29,d15,#16 + vmlal.u32 q10,d28,d1[1] + vadd.u64 d29,d29,d14 + vmlal.u32 q11,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q12,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] + vmlal.u32 q13,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q7,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q8,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q9,d29,d5[0] + vshr.u64 d14,d14,#16 + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vadd.u64 d14,d14,d15 + vmlal.u32 q12,d29,d6[1] + vshr.u64 d14,d14,#16 + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vadd.u64 d16,d16,d14 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128]! + vmlal.u32 q9,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q10,d28,d1[0] + vshl.i64 d29,d17,#16 + vmlal.u32 q11,d28,d1[1] + vadd.u64 d29,d29,d16 + vmlal.u32 q12,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q13,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2] + vmlal.u32 q6,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q8,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q9,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q10,d29,d5[0] + vshr.u64 d16,d16,#16 + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vadd.u64 d16,d16,d17 + vmlal.u32 q13,d29,d6[1] + vshr.u64 d16,d16,#16 + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vadd.u64 d18,d18,d16 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128]! + vmlal.u32 q10,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q11,d28,d1[0] + vshl.i64 d29,d19,#16 + vmlal.u32 q12,d28,d1[1] + vadd.u64 d29,d29,d18 + vmlal.u32 q13,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q6,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] + vmlal.u32 q7,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q9,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q10,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q11,d29,d5[0] + vshr.u64 d18,d18,#16 + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vadd.u64 d18,d18,d19 + vmlal.u32 q6,d29,d6[1] + vshr.u64 d18,d18,#16 + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vadd.u64 d20,d20,d18 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128]! + vmlal.u32 q11,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q12,d28,d1[0] + vshl.i64 d29,d21,#16 + vmlal.u32 q13,d28,d1[1] + vadd.u64 d29,d29,d20 + vmlal.u32 q6,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q7,d28,d2[1] + vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+4] + vmlal.u32 q8,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q10,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q11,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q12,d29,d5[0] + vshr.u64 d20,d20,#16 + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vadd.u64 d20,d20,d21 + vmlal.u32 q7,d29,d6[1] + vshr.u64 d20,d20,#16 + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vadd.u64 d22,d22,d20 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128]! + vmlal.u32 q12,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q13,d28,d1[0] + vshl.i64 d29,d23,#16 + vmlal.u32 q6,d28,d1[1] + vadd.u64 d29,d29,d22 + vmlal.u32 q7,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q8,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] + vmlal.u32 q9,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q11,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q12,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q13,d29,d5[0] + vshr.u64 d22,d22,#16 + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vadd.u64 d22,d22,d23 + vmlal.u32 q8,d29,d6[1] + vshr.u64 d22,d22,#16 + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vadd.u64 d24,d24,d22 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128]! + vmlal.u32 q13,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q6,d28,d1[0] + vshl.i64 d29,d25,#16 + vmlal.u32 q7,d28,d1[1] + vadd.u64 d29,d29,d24 + vmlal.u32 q8,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q9,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] + vmlal.u32 q10,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q12,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q13,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q6,d29,d5[0] + vshr.u64 d24,d24,#16 + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vadd.u64 d24,d24,d25 + vmlal.u32 q9,d29,d6[1] + vshr.u64 d24,d24,#16 + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vadd.u64 d26,d26,d24 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128]! + vmlal.u32 q6,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q7,d28,d1[0] + vshl.i64 d29,d27,#16 + vmlal.u32 q8,d28,d1[1] + vadd.u64 d29,d29,d26 + vmlal.u32 q9,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q10,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] + vmlal.u32 q11,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q12,d28,d3[1] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q13,d29,d4[0] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d29,d5[0] + vshr.u64 d26,d26,#16 + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vadd.u64 d26,d26,d27 + vmlal.u32 q10,d29,d6[1] + vshr.u64 d26,d26,#16 + vmlal.u32 q11,d29,d7[0] + vmlal.u32 q12,d29,d7[1] + vadd.u64 d12,d12,d26 + vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] + add r10,sp,#8 @ rewind + sub r8,r5,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs r8,r8,#8 + vmlal.u32 q6,d28,d0[0] + vld1.64 {q13},[r6,:128] + vmlal.u32 q7,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] + vmlal.u32 q8,d28,d1[0] + vld1.32 {d4,d5,d6,d7},[r3]! + vmlal.u32 q9,d28,d1[1] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+1] + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vmlal.u32 q11,d29,d6[1] + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vst1.64 {q6},[r7,:128]! + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128] + vmlal.u32 q8,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] + vmlal.u32 q9,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d1[1] + vmlal.u32 q11,d28,d2[0] + vmlal.u32 q12,d28,d2[1] + vmlal.u32 q13,d28,d3[0] + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] + vmlal.u32 q7,d29,d4[0] + vmlal.u32 q8,d29,d4[1] + vmlal.u32 q9,d29,d5[0] + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vmlal.u32 q12,d29,d6[1] + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vst1.64 {q7},[r7,:128]! + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128] + vmlal.u32 q9,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] + vmlal.u32 q10,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q11,d28,d1[1] + vmlal.u32 q12,d28,d2[0] + vmlal.u32 q13,d28,d2[1] + vmlal.u32 q6,d28,d3[0] + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] + vmlal.u32 q8,d29,d4[0] + vmlal.u32 q9,d29,d4[1] + vmlal.u32 q10,d29,d5[0] + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vmlal.u32 q13,d29,d6[1] + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vst1.64 {q8},[r7,:128]! + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128] + vmlal.u32 q10,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] + vmlal.u32 q11,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q12,d28,d1[1] + vmlal.u32 q13,d28,d2[0] + vmlal.u32 q6,d28,d2[1] + vmlal.u32 q7,d28,d3[0] + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] + vmlal.u32 q9,d29,d4[0] + vmlal.u32 q10,d29,d4[1] + vmlal.u32 q11,d29,d5[0] + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vmlal.u32 q6,d29,d6[1] + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vst1.64 {q9},[r7,:128]! + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128] + vmlal.u32 q11,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] + vmlal.u32 q12,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q13,d28,d1[1] + vmlal.u32 q6,d28,d2[0] + vmlal.u32 q7,d28,d2[1] + vmlal.u32 q8,d28,d3[0] + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] + vmlal.u32 q10,d29,d4[0] + vmlal.u32 q11,d29,d4[1] + vmlal.u32 q12,d29,d5[0] + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vmlal.u32 q7,d29,d6[1] + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vst1.64 {q10},[r7,:128]! + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128] + vmlal.u32 q12,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] + vmlal.u32 q13,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q6,d28,d1[1] + vmlal.u32 q7,d28,d2[0] + vmlal.u32 q8,d28,d2[1] + vmlal.u32 q9,d28,d3[0] + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] + vmlal.u32 q11,d29,d4[0] + vmlal.u32 q12,d29,d4[1] + vmlal.u32 q13,d29,d5[0] + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vmlal.u32 q8,d29,d6[1] + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vst1.64 {q11},[r7,:128]! + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128] + vmlal.u32 q13,d28,d0[1] + vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+6] + vmlal.u32 q6,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q7,d28,d1[1] + vmlal.u32 q8,d28,d2[0] + vmlal.u32 q9,d28,d2[1] + vmlal.u32 q10,d28,d3[0] + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] + vmlal.u32 q12,d29,d4[0] + vmlal.u32 q13,d29,d4[1] + vmlal.u32 q6,d29,d5[0] + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vmlal.u32 q9,d29,d6[1] + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vst1.64 {q12},[r7,:128]! + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128] + vmlal.u32 q6,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] + vmlal.u32 q7,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q8,d28,d1[1] + vmlal.u32 q9,d28,d2[0] + vmlal.u32 q10,d28,d2[1] + vmlal.u32 q11,d28,d3[0] + vmlal.u32 q12,d28,d3[1] + it eq + subeq r1,r1,r5,lsl#2 @ rewind + vmlal.u32 q13,d29,d4[0] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q6,d29,d4[1] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q7,d29,d5[0] + add r10,sp,#8 @ rewind + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vmlal.u32 q10,d29,d6[1] + vmlal.u32 q11,d29,d7[0] + vst1.64 {q13},[r7,:128]! + vmlal.u32 q12,d29,d7[1] + + bne .LNEON_8n_inner + add r6,sp,#128 + vst1.64 {q6,q7},[r7,:256]! + veor q2,q2,q2 @ d4-d5 + vst1.64 {q8,q9},[r7,:256]! + veor q3,q3,q3 @ d6-d7 + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12},[r7,:128] + + subs r9,r9,#8 + vld1.64 {q6,q7},[r6,:256]! + vld1.64 {q8,q9},[r6,:256]! + vld1.64 {q10,q11},[r6,:256]! + vld1.64 {q12,q13},[r6,:256]! + + itt ne + subne r3,r3,r5,lsl#2 @ rewind + bne .LNEON_8n_outer + + add r7,sp,#128 + vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 d10,d12,#16 + vst1.64 {q2,q3},[sp,:256]! + vadd.u64 d13,d13,d10 + vst1.64 {q2,q3}, [sp,:256]! + vshr.u64 d10,d13,#16 + vst1.64 {q2,q3}, [sp,:256]! + vzip.16 d12,d13 + + mov r8,r5 + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + vadd.u64 d12,d12,d10 + vshr.u64 d10,d12,#16 + vld1.64 {q8,q9}, [r6, :256]! + vadd.u64 d13,d13,d10 + vld1.64 {q10,q11}, [r6, :256]! + vshr.u64 d10,d13,#16 + vld1.64 {q12,q13}, [r6, :256]! + vzip.16 d12,d13 + +.LNEON_tail_entry: + vadd.u64 d14,d14,d10 + vst1.32 {d12[0]}, [r7, :32]! + vshr.u64 d10,d14,#16 + vadd.u64 d15,d15,d10 + vshr.u64 d10,d15,#16 + vzip.16 d14,d15 + vadd.u64 d16,d16,d10 + vst1.32 {d14[0]}, [r7, :32]! + vshr.u64 d10,d16,#16 + vadd.u64 d17,d17,d10 + vshr.u64 d10,d17,#16 + vzip.16 d16,d17 + vadd.u64 d18,d18,d10 + vst1.32 {d16[0]}, [r7, :32]! + vshr.u64 d10,d18,#16 + vadd.u64 d19,d19,d10 + vshr.u64 d10,d19,#16 + vzip.16 d18,d19 + vadd.u64 d20,d20,d10 + vst1.32 {d18[0]}, [r7, :32]! + vshr.u64 d10,d20,#16 + vadd.u64 d21,d21,d10 + vshr.u64 d10,d21,#16 + vzip.16 d20,d21 + vadd.u64 d22,d22,d10 + vst1.32 {d20[0]}, [r7, :32]! + vshr.u64 d10,d22,#16 + vadd.u64 d23,d23,d10 + vshr.u64 d10,d23,#16 + vzip.16 d22,d23 + vadd.u64 d24,d24,d10 + vst1.32 {d22[0]}, [r7, :32]! + vshr.u64 d10,d24,#16 + vadd.u64 d25,d25,d10 + vshr.u64 d10,d25,#16 + vzip.16 d24,d25 + vadd.u64 d26,d26,d10 + vst1.32 {d24[0]}, [r7, :32]! + vshr.u64 d10,d26,#16 + vadd.u64 d27,d27,d10 + vshr.u64 d10,d27,#16 + vzip.16 d26,d27 + vld1.64 {q6,q7}, [r6, :256]! + subs r8,r8,#8 + vst1.32 {d26[0]}, [r7, :32]! 
+ bne .LNEON_tail + + vst1.32 {d10[0]}, [r7, :32] @ top-most bit + sub r3,r3,r5,lsl#2 @ rewind r3 + subs r1,sp,#0 @ clear carry flag + add r2,sp,r5,lsl#2 + +.LNEON_sub: + ldmia r1!, {r4,r5,r6,r7} + ldmia r3!, {r8,r9,r10,r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne .LNEON_sub + + ldr r10, [r1] @ load top-most bit + mov r11,sp + veor q0,q0,q0 + sub r11,r2,r11 @ this is num*4 + veor q1,q1,q1 + mov r1,sp + sub r0,r0,r11 @ rewind r0 + mov r3,r2 @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia r1!, {r4,r5,r6,r7} + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + ldmia r1, {r4,r5,r6,r7} + stmia r0!, {r8,r9,r10,r11} + sub r1,r1,#16 + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r1,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne .LNEON_copy_n_zap + + mov sp,ip + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + bx lr @ bx lr +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/armv8-mont-ios64.S b/ring-0.17.14/pregenerated/armv8-mont-ios64.S new file mode 100644 index 0000000000..ef7d5b6761 --- /dev/null +++ b/ring-0.17.14/pregenerated/armv8-mont-ios64.S @@ -0,0 +1,1416 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +.globl _bn_mul_mont_nohw +.private_extern _bn_mul_mont_nohw + +.align 5 +_bn_mul_mont_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. 
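+	// (The flags set by the `subs xzr,x6,#1` below are exactly the carry of
+	// that discarded addition: lo(np[0]*m1) + x6 is 0 mod 2^64, so the add
+	// carries precisely when x6 is non-zero, and x6 - 1 borrows only when
+	// x6 is zero.)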
+ subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _bn_sqr8x_mont +.private_extern _bn_sqr8x_mont + +.align 5 +_bn_sqr8x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh 
x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? 
+ ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? 
+ adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _bn_mul4x_mont +.private_extern _bn_mul4x_mont + +.align 5 +_bn_mul4x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/armv8-mont-linux64.S b/ring-0.17.14/pregenerated/armv8-mont-linux64.S new file mode 100644 index 0000000000..4e8a5c226d --- /dev/null +++ b/ring-0.17.14/pregenerated/armv8-mont-linux64.S @@ -0,0 +1,1416 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +.globl bn_mul_mont_nohw +.hidden bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function +.align 5 +bn_mul_mont_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,.L1st_skip + +.L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,.L1st + +.L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +.Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,.Linner_skip + +.Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,.Linner + +.Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +.Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,.Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +.Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,.Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +.globl bn_sqr8x_mont +.hidden bn_sqr8x_mont +.type bn_sqr8x_mont,%function +.align 5 +bn_sqr8x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,.Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + 
adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,.Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +.Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? 
+ b.eq .Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? + ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,.Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +.Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,.Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +.Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs 
x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,.Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? + adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,.Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +.Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,.Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +.Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,.Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +.Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,.Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_sqr8x_mont,.-bn_sqr8x_mont +.globl bn_mul4x_mont +.hidden bn_mul4x_mont +.type bn_mul4x_mont,%function +.align 5 +bn_mul4x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +.Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_reduction + + cbz x10,.Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,.Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +.Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +.Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,.Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_tail + +.align 4 +.Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq .Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b .Loop_mul4x_reduction + +.align 4 +.Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +.Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,.Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +.Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,.Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul4x_mont,.-bn_mul4x_mont +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/armv8-mont-win64.S b/ring-0.17.14/pregenerated/armv8-mont-win64.S new file mode 100644 index 0000000000..b5f2b9eb80 --- /dev/null +++ b/ring-0.17.14/pregenerated/armv8-mont-win64.S @@ -0,0 +1,1422 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +.globl bn_mul_mont_nohw + +.def bn_mul_mont_nohw + .type 32 +.endef +.align 5 +bn_mul_mont_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl bn_sqr8x_mont + +.def bn_sqr8x_mont + .type 32 +.endef +.align 5 +bn_sqr8x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul 
x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? 
+ b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? + ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + 
umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? + adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
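+	// (Descriptive note on the trick above: after "sbcs", a clear carry
+	// flag — condition "lo" — means the subtraction borrowed, i.e. the
+	// value was already smaller than the modulus. The "csel ..., lo"
+	// instructions in the cond-copy loop below therefore keep the
+	// original words when the subtraction borrowed and the subtracted
+	// words otherwise, so the final reduction avoids a secret-dependent
+	// branch.)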
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl bn_mul4x_mont + +.def bn_mul4x_mont + .type 32 +.endef +.align 5 +bn_mul4x_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/bsaes-armv7-linux32.S b/ring-0.17.14/pregenerated/bsaes-armv7-linux32.S new file mode 100644 index 0000000000..771c2cdf7b --- /dev/null +++ b/ring-0.17.14/pregenerated/bsaes-armv7-linux32.S @@ -0,0 +1,796 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ +@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel +@ of Linaro. +@ ==================================================================== + +@ Bit-sliced AES for ARM NEON +@ +@ February 2012. +@ +@ This implementation is direct adaptation of bsaes-x86_64 module for +@ ARM NEON. Except that this module is endian-neutral [in sense that +@ it can be compiled for either endianness] by courtesy of vld1.8's +@ neutrality. Initial version doesn't implement interface to OpenSSL, +@ only low-level primitives and unsupported entry points, just enough +@ to collect performance results, which for Cortex-A8 core are: +@ +@ encrypt 19.5 cycles per byte processed with 128-bit key +@ decrypt 22.1 cycles per byte processed with 128-bit key +@ key conv. 440 cycles per 128-bit key/0.18 of 8x block +@ +@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +@ which is [much] worse than anticipated (for further details see +@ http://www.openssl.org/~appro/Snapdragon-S4.html). +@ +@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +@ manages in 20.0 cycles]. +@ +@ When comparing to x86_64 results keep in mind that NEON unit is +@ [mostly] single-issue and thus can't [fully] benefit from +@ instruction-level parallelism. And when comparing to aes-armv4 +@ results keep in mind key schedule conversion overhead (see +@ bsaes-x86_64.pl for further details)... +@ +@ + +@ April-August 2013 +@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. 
+ +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define __ARM_MAX_ARCH__ 7 +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#if defined(__thumb2__) && !defined(__APPLE__) +.thumb +#else +.code 32 +# undef __thumb2__ +#endif + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +.LM0ISR:@ InvShiftRows constants +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LM0SR:@ ShiftRows constants +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LREVM0SR: +.quad 0x090d01050c000408, 0x03070b0f060a0e02 +.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +_bsaes_encrypt8: + adr r6,. + vldmia r4!, {q9} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,.LM0SR +#else + sub r6,r6,#_bsaes_encrypt8-.LM0SR +#endif + + vldmia r6!, {q8} @ .LM0SR +_bsaes_encrypt8_alt: + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 +_bsaes_encrypt8_bitslice: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor 
q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b .Lenc_sbox +.align 4 +.Lenc_loop: + vldmia r4!, {q8,q9,q10,q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +.Lenc_sbox: + veor q2, q2, q1 + veor q5, q5, q6 + veor q3, q3, q0 + veor q6, q6, q2 + veor q5, q5, q0 + + veor q6, q6, q3 + veor q3, q3, q7 + veor q7, q7, q5 + veor q3, q3, q4 + veor q4, q4, q5 + + veor q2, q2, q7 + veor q3, q3, q1 + veor q1, q1, q5 + veor q11, q7, q4 + veor q10, q1, q2 + veor q9, q5, q3 + veor q13, q2, q4 + vmov q8, q10 + veor q12, q6, q0 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q3, q0 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q7, q1 + veor q12, q5, q6 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q2, q3 + veor q9, q9, q14 + vand q13, q4, q0 + vand q14, q1, q5 + vorr q15, q7, q6 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q6, q0 + veor q8, q5, q3 + veor q10, q15, q14 + vand q10, q10, q6 + veor q6, q6, q5 + vand q11, q5, q15 + vand q6, q6, q14 + veor q5, q11, q10 + veor q6, q6, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q3 + vand q8, q8, q15 + vand q3, q3, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q3 + veor q12, q12, q11 + veor q3, q3, q10 + veor q6, q6, q12 + veor q0, q0, q12 + veor q5, q5, q8 + veor q3, q3, q8 + + veor q12, q7, q4 + veor q8, q1, q2 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q4 + veor q12, q12, q8 + veor q4, q4, q2 + vand q8, q8, q15 + vand q2, q2, q13 + vand q12, q12, q14 + vand q4, q4, q9 + veor q8, q8, q12 + veor q4, q4, q2 + veor q12, q12, q11 + veor q2, q2, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q7 + veor q7, q7, q1 + vand q11, q1, q15 + vand q7, q7, q14 + veor q1, q11, q10 + veor q7, q7, q11 + veor q7, q7, q12 + veor q4, q4, q12 + veor q1, q1, q8 + veor q2, q2, q8 + veor q7, q7, q0 + veor q1, q1, q6 + veor q6, q6, q0 + veor q4, q4, q7 + veor q0, q0, q1 + + veor q1, q1, q5 + veor q5, q5, q2 + veor q2, q2, q3 + veor q3, q3, q5 + 
veor q4, q4, q5 + + veor q6, q6, q3 + subs r5,r5,#1 + bcc .Lenc_done + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q4, q4, #12 + veor q1, q1, q9 + vext.8 q11, q6, q6, #12 + veor q4, q4, q10 + vext.8 q12, q3, q3, #12 + veor q6, q6, q11 + vext.8 q13, q7, q7, #12 + veor q3, q3, q12 + vext.8 q14, q2, q2, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q2, q2, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q3 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q3, q3, #8 + veor q12, q12, q6 + vext.8 q9, q7, q7, #8 + veor q15, q15, q2 + vext.8 q3, q6, q6, #8 + veor q11, q11, q4 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q6, q2, q2, #8 + veor q11, q11, q5 + vext.8 q2, q4, q4, #8 + veor q5, q9, q13 + veor q4, q8, q12 + veor q3, q3, q11 + veor q7, q7, q15 + veor q6, q6, q14 + @ vmov q4, q8 + veor q2, q2, q10 + @ vmov q5, q9 + vldmia r6, {q12} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM + addeq r6,r6,#0x10 + bne .Lenc_loop + vldmia r6, {q12} @ .LSRM0 + b .Lenc_loop +.align 4 +.Lenc_done: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q3, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q3, q3, q11 + vshr.u64 q10, q4, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q6 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q6, q6, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q4, q4, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q3, #2 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q3, q3, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q6 + veor q11, q11, q4 + vand q10, q10, q9 + vand q11, q11, q9 + veor q6, q6, q10 + vshl.u64 q10, q10, #2 + veor q4, q4, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q6, #4 + vshr.u64 q11, q4, #4 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q4, q4, q8 + veor q6, q6, q8 + veor q3, q3, q8 + veor q7, q7, q8 + veor q2, q2, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr +.size _bsaes_encrypt8,.-_bsaes_encrypt8 +.type _bsaes_key_convert,%function +.align 4 +_bsaes_key_convert: + adr r6,. + vld1.8 {q7}, [r4]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,.LM0 +#else + sub r6,r6,#_bsaes_key_convert-.LM0 +#endif + vld1.8 {q15}, [r4]! 
@ load round 1 key + + vmov.i8 q8, #0x01 @ bit masks + vmov.i8 q9, #0x02 + vmov.i8 q10, #0x04 + vmov.i8 q11, #0x08 + vmov.i8 q12, #0x10 + vmov.i8 q13, #0x20 + vldmia r6, {q14} @ .LM0 + +#ifdef __ARMEL__ + vrev32.8 q7, q7 + vrev32.8 q15, q15 +#endif + sub r5,r5,#1 + vstmia r12!, {q7} @ save round 0 key + b .Lkey_loop + +.align 4 +.Lkey_loop: + vtbl.8 d14,{q15},d28 + vtbl.8 d15,{q15},d29 + vmov.i8 q6, #0x40 + vmov.i8 q15, #0x80 + + vtst.8 q0, q7, q8 + vtst.8 q1, q7, q9 + vtst.8 q2, q7, q10 + vtst.8 q3, q7, q11 + vtst.8 q4, q7, q12 + vtst.8 q5, q7, q13 + vtst.8 q6, q7, q6 + vtst.8 q7, q7, q15 + vld1.8 {q15}, [r4]! @ load next round key + vmvn q0, q0 @ "pnot" + vmvn q1, q1 + vmvn q5, q5 + vmvn q6, q6 +#ifdef __ARMEL__ + vrev32.8 q15, q15 +#endif + subs r5,r5,#1 + vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key + bne .Lkey_loop + + vmov.i8 q7,#0x63 @ compose .L63 + @ don't save last round key + bx lr +.size _bsaes_key_convert,.-_bsaes_key_convert +.globl bsaes_ctr32_encrypt_blocks +.hidden bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 5 +bsaes_ctr32_encrypt_blocks: + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. + mov ip, sp + stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + + vld1.8 {q0}, [r8] @ load counter +#ifdef __APPLE__ + mov r8, #:lower16:(.LREVM0SR-.LM0) + add r8, r6, r8 +#else + add r8, r6, #.LREVM0SR-.LM0 @ borrow r8 +#endif + vldmia sp, {q4} @ load round0 key +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + +.align 2 + add r12, r3, #248 + vld1.8 {q0}, [r8] @ load counter + adrl r8, .LREVM0SR @ borrow r8 + vldmia r12, {q4} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 q8,#1 @ compose 1<<96 + veor q9,q9,q9 + vrev32.8 q0,q0 + vext.8 q8,q9,q8,#4 + vrev32.8 q4,q4 + vadd.u32 q9,q8,q8 @ compose 2<<96 + vstmia sp, {q4} @ save adjusted round0 key + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + vadd.u32 q10, q8, q9 @ compose 3<<96 + vadd.u32 q1, q0, q8 @ +1 + vadd.u32 q2, q0, q9 @ +2 + vadd.u32 q3, q0, q10 @ +3 + vadd.u32 q4, q1, q10 + vadd.u32 q5, q2, q10 + vadd.u32 q6, q3, q10 + vadd.u32 q7, q4, q10 + vadd.u32 q10, q5, q10 @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia sp, {q9} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x10 @ pass next round key +#else + add r4, r3, #264 +#endif + vldmia r8, {q8} @ .LREVM0SR + mov r5, r10 @ pass rounds + vstmia r9, {q10} @ save next counter +#ifdef __APPLE__ + mov r6, #:lower16:(.LREVM0SR-.LSR) + sub r6, r8, r6 +#else + sub r6, r8, #.LREVM0SR-.LSR @ pass 
constants +#endif + + bl _bsaes_encrypt8_alt + + subs r2, r2, #8 + blo .Lctr_enc_loop_done + + vld1.8 {q8,q9}, [r0]! @ load input + vld1.8 {q10,q11}, [r0]! + veor q0, q8 + veor q1, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q10 + veor q6, q11 + vld1.8 {q14,q15}, [r0]! + veor q3, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q7, q13 + veor q2, q14 + vst1.8 {q4}, [r1]! + veor q5, q15 + vst1.8 {q6}, [r1]! + vmov.i32 q8, #1 @ compose 1<<96 + vst1.8 {q3}, [r1]! + veor q9, q9, q9 + vst1.8 {q7}, [r1]! + vext.8 q8, q9, q8, #4 + vst1.8 {q2}, [r1]! + vadd.u32 q9,q8,q8 @ compose 2<<96 + vst1.8 {q5}, [r1]! + vldmia r9, {q0} @ load counter + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add r2, r2, #8 + vld1.8 {q8}, [r0]! @ load input + veor q0, q8 + vst1.8 {q0}, [r1]! @ write output + cmp r2, #2 + blo .Lctr_enc_done + vld1.8 {q9}, [r0]! + veor q1, q9 + vst1.8 {q1}, [r1]! + beq .Lctr_enc_done + vld1.8 {q10}, [r0]! + veor q4, q10 + vst1.8 {q4}, [r1]! + cmp r2, #4 + blo .Lctr_enc_done + vld1.8 {q11}, [r0]! + veor q6, q11 + vst1.8 {q6}, [r1]! + beq .Lctr_enc_done + vld1.8 {q12}, [r0]! + veor q3, q12 + vst1.8 {q3}, [r1]! + cmp r2, #6 + blo .Lctr_enc_done + vld1.8 {q13}, [r0]! + veor q7, q13 + vst1.8 {q7}, [r1]! + beq .Lctr_enc_done + vld1.8 {q14}, [r0] + veor q2, q14 + vst1.8 {q2}, [r1]! + +.Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +.Lctr_enc_bzero:@ wipe key schedule [if any] + vstmia sp!, {q0,q1} + cmp sp, r9 + bne .Lctr_enc_bzero +#else + vstmia sp, {q0,q1} +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return + + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha-armv4-linux32.S b/ring-0.17.14/pregenerated/chacha-armv4-linux32.S new file mode 100644 index 0000000000..f058e5801e --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-armv4-linux32.S @@ -0,0 +1,1449 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
+.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 + +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} + adr r14,.Lsigma + ldmia r12,{r4,r5,r6,r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + ldmia r14,{r0,r1,r2,r3} @ load sigma + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key + stmdb sp!,{r0,r1,r2,r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + str r11,[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(16+15)] + mov r11,#10 + b .Loop + +.align 4 +.Loop: + subs r11,r11,#1 + add r0,r0,r4 + mov r12,r12,ror#16 + add r1,r1,r5 + mov r10,r10,ror#16 + eor r12,r12,r0,ror#16 + eor r10,r10,r1,ror#16 + add r8,r8,r12 + mov r4,r4,ror#20 + add r9,r9,r10 + mov r5,r5,ror#20 + eor r4,r4,r8,ror#20 + eor r5,r5,r9,ror#20 + add r0,r0,r4 + mov r12,r12,ror#24 + add r1,r1,r5 + mov r10,r10,ror#24 + eor r12,r12,r0,ror#24 + eor r10,r10,r1,ror#24 + add r8,r8,r12 + mov r4,r4,ror#25 + add r9,r9,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+13)] + ldr r10,[sp,#4*(16+15)] + eor r4,r4,r8,ror#25 + eor r5,r5,r9,ror#25 + str r8,[sp,#4*(16+8)] + ldr r8,[sp,#4*(16+10)] + add r2,r2,r6 + mov r14,r14,ror#16 + str r9,[sp,#4*(16+9)] + ldr r9,[sp,#4*(16+11)] + add r3,r3,r7 + mov r10,r10,ror#16 + eor r14,r14,r2,ror#16 + eor r10,r10,r3,ror#16 + add r8,r8,r14 + mov r6,r6,ror#20 + add r9,r9,r10 + mov r7,r7,ror#20 + eor r6,r6,r8,ror#20 + eor r7,r7,r9,ror#20 + add r2,r2,r6 + mov r14,r14,ror#24 + add r3,r3,r7 + mov r10,r10,ror#24 + eor r14,r14,r2,ror#24 + eor r10,r10,r3,ror#24 + add r8,r8,r14 + mov r6,r6,ror#25 + add r9,r9,r10 + mov r7,r7,ror#25 + eor r6,r6,r8,ror#25 + eor r7,r7,r9,ror#25 + add r0,r0,r5 + mov r10,r10,ror#16 + add r1,r1,r6 + mov r12,r12,ror#16 + eor r10,r10,r0,ror#16 + eor r12,r12,r1,ror#16 + add r8,r8,r10 + mov r5,r5,ror#20 + add r9,r9,r12 + mov r6,r6,ror#20 + eor r5,r5,r8,ror#20 + eor r6,r6,r9,ror#20 + add r0,r0,r5 + mov r10,r10,ror#24 + add r1,r1,r6 + mov r12,r12,ror#24 + eor r10,r10,r0,ror#24 + eor r12,r12,r1,ror#24 + add r8,r8,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+15)] + ldr r10,[sp,#4*(16+13)] + add r9,r9,r12 + mov r6,r6,ror#25 + eor r5,r5,r8,ror#25 + eor r6,r6,r9,ror#25 + str r8,[sp,#4*(16+10)] + ldr r8,[sp,#4*(16+8)] + add r2,r2,r7 + mov r10,r10,ror#16 + str r9,[sp,#4*(16+11)] + ldr r9,[sp,#4*(16+9)] + add r3,r3,r4 + mov r14,r14,ror#16 + eor r10,r10,r2,ror#16 + eor r14,r14,r3,ror#16 + add r8,r8,r10 + mov r7,r7,ror#20 + add r9,r9,r14 + mov r4,r4,ror#20 + eor r7,r7,r8,ror#20 + eor r4,r4,r9,ror#20 + add r2,r2,r7 + mov r10,r10,ror#24 + add r3,r3,r4 + mov r14,r14,ror#24 + eor r10,r10,r2,ror#24 + eor r14,r14,r3,ror#24 + add r8,r8,r10 + mov r7,r7,ror#25 + add r9,r9,r14 + mov 
r4,r4,ror#25 + eor r7,r7,r8,ror#25 + eor r4,r4,r9,ror#25 + bne .Loop + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + cmp r11,#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr r8,[sp,#4*(0)] @ load key material + ldr r9,[sp,#4*(1)] + +#if __ARM_ARCH>=6 || !defined(__ARMEB__) +# if __ARM_ARCH<7 + orr r10,r12,r14 + tst r10,#3 @ are input and output aligned? + ldr r10,[sp,#4*(2)] + bne .Lunaligned + cmp r11,#64 @ restore flags +# else + ldr r10,[sp,#4*(2)] +# endif + ldr r11,[sp,#4*(3)] + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 @ xor with input + eorhs r1,r1,r9 + add r8,sp,#4*(4) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r1,[r14,#-12] + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 + add r8,sp,#4*(8) + str r4,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r5,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 + eorhs r1,r1,r9 + add r8,sp,#4*(12) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + str r1,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] 
+# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r4,[r14],#16 @ store output + str r5,[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH<7 + b .Ltail + +.align 4 +.Lunaligned:@ unaligned endian-neutral path + cmp r11,#64 @ restore flags +# endif +#endif +#if __ARM_ARCH<7 + ldr r11,[sp,#4*(3)] + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+0) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r0,sp,#4*(16+8) + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] + add r8,sp,#4*(4+4) + ldmia r8,{r8,r9,r10,r11} @ load key material + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" + strhi r11,[sp,#4*(16+11)] @ copy "rx" + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+8) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r4,r4,r8 @ accumulate key material +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add r9,sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb r10,[r9],#1 @ read buffer on stack + ldrb r11,[r12],#1 @ read input + subs r8,r8,#1 + eor r11,r11,r10 + strb r11,[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl ChaCha20_ctr32_neon +.hidden ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} + adr r14,.Lsigma + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so + stmdb sp!,{r0,r1,r2,r3} + + vld1.32 {q1,q2},[r3] @ load key + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {q3},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0,r1,r2,r3} @ load sigma + vld1.32 {q0},[r14]! 
@ load sigma + vld1.32 {q12},[r14] @ one + vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce + vst1.32 {q0,q1},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + vshl.i32 d26,d24,#1 @ two + vstr d24,[sp,#4*(16+0)] + vshl.i32 d28,d24,#2 @ four + vstr d26,[sp,#4*(16+2)] + vmov q4,q0 + vstr d28,[sp,#4*(16+4)] + vmov q8,q0 + vmov q5,q1 + vmov q9,q1 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + cmp r11,#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov q4,q0 + str r11,[sp,#4*(32+2)] @ save len + vmov q8,q0 + str r12, [sp,#4*(32+1)] @ save inp + vmov q5,q1 + str r14, [sp,#4*(32+0)] @ save out + vmov q9,q1 +.Loop_neon_enter: + ldr r11, [sp,#4*(15)] + vadd.i32 q7,q3,q12 @ counter+1 + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + vmov q6,q2 + ldr r10, [sp,#4*(13)] + vmov q10,q2 + ldr r14,[sp,#4*(14)] + vadd.i32 q11,q7,q12 @ counter+2 + str r11, [sp,#4*(16+15)] + mov r11,#10 + add r12,r12,#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs r11,r11,#1 + vadd.i32 q0,q0,q1 + add r0,r0,r4 + vadd.i32 q4,q4,q5 + mov r12,r12,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r5 + veor q3,q3,q0 + mov r10,r10,ror#16 + veor q7,q7,q4 + eor r12,r12,r0,ror#16 + veor q11,q11,q8 + eor r10,r10,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r12 + vrev32.16 q7,q7 + mov r4,r4,ror#20 + vrev32.16 q11,q11 + add r9,r9,r10 + vadd.i32 q2,q2,q3 + mov r5,r5,ror#20 + vadd.i32 q6,q6,q7 + eor r4,r4,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r5,r5,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r4 + veor q13,q5,q6 + mov r12,r12,ror#24 + veor q14,q9,q10 + add r1,r1,r5 + vshr.u32 q1,q12,#20 + mov r10,r10,ror#24 + vshr.u32 q5,q13,#20 + eor r12,r12,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r10,r10,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r12 + vsli.32 q5,q13,#12 + mov r4,r4,ror#25 + vsli.32 q9,q14,#12 + add r9,r9,r10 + vadd.i32 q0,q0,q1 + mov r5,r5,ror#25 + vadd.i32 q4,q4,q5 + str r10,[sp,#4*(16+13)] + vadd.i32 q8,q8,q9 + ldr r10,[sp,#4*(16+15)] + veor q12,q3,q0 + eor r4,r4,r8,ror#25 + veor q13,q7,q4 + eor r5,r5,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+8)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+10)] + vshr.u32 q7,q13,#24 + add r2,r2,r6 + vshr.u32 q11,q14,#24 + mov r14,r14,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+9)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+11)] + vsli.32 q11,q14,#8 + add r3,r3,r7 + vadd.i32 q2,q2,q3 + mov r10,r10,ror#16 + vadd.i32 q6,q6,q7 + eor r14,r14,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r10,r10,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r14 + veor q13,q5,q6 + mov r6,r6,ror#20 + veor q14,q9,q10 + add r9,r9,r10 + vshr.u32 q1,q12,#25 + mov r7,r7,ror#20 + vshr.u32 q5,q13,#25 + eor r6,r6,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r7,r7,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r6 + vsli.32 q5,q13,#7 + mov r14,r14,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r7 + vext.8 q2,q2,q2,#8 + mov r10,r10,ror#24 + vext.8 q6,q6,q6,#8 + eor r14,r14,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r10,r10,r3,ror#24 + vext.8 q1,q1,q1,#4 + add r8,r8,r14 + vext.8 q5,q5,q5,#4 + mov r6,r6,ror#25 + vext.8 q9,q9,q9,#4 + add r9,r9,r10 + vext.8 q3,q3,q3,#12 + mov r7,r7,ror#25 + vext.8 q7,q7,q7,#12 + eor r6,r6,r8,ror#25 + vext.8 q11,q11,q11,#12 + eor r7,r7,r9,ror#25 + vadd.i32 q0,q0,q1 + add r0,r0,r5 + vadd.i32 q4,q4,q5 + mov r10,r10,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r6 + veor q3,q3,q0 + mov r12,r12,ror#16 + veor q7,q7,q4 + eor r10,r10,r0,ror#16 + veor q11,q11,q8 + eor r12,r12,r1,ror#16 + vrev32.16 q3,q3 + add 
r8,r8,r10 + vrev32.16 q7,q7 + mov r5,r5,ror#20 + vrev32.16 q11,q11 + add r9,r9,r12 + vadd.i32 q2,q2,q3 + mov r6,r6,ror#20 + vadd.i32 q6,q6,q7 + eor r5,r5,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r6,r6,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r5 + veor q13,q5,q6 + mov r10,r10,ror#24 + veor q14,q9,q10 + add r1,r1,r6 + vshr.u32 q1,q12,#20 + mov r12,r12,ror#24 + vshr.u32 q5,q13,#20 + eor r10,r10,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r12,r12,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r10 + vsli.32 q5,q13,#12 + mov r5,r5,ror#25 + vsli.32 q9,q14,#12 + str r10,[sp,#4*(16+15)] + vadd.i32 q0,q0,q1 + ldr r10,[sp,#4*(16+13)] + vadd.i32 q4,q4,q5 + add r9,r9,r12 + vadd.i32 q8,q8,q9 + mov r6,r6,ror#25 + veor q12,q3,q0 + eor r5,r5,r8,ror#25 + veor q13,q7,q4 + eor r6,r6,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+10)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+8)] + vshr.u32 q7,q13,#24 + add r2,r2,r7 + vshr.u32 q11,q14,#24 + mov r10,r10,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+11)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+9)] + vsli.32 q11,q14,#8 + add r3,r3,r4 + vadd.i32 q2,q2,q3 + mov r14,r14,ror#16 + vadd.i32 q6,q6,q7 + eor r10,r10,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r14,r14,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r10 + veor q13,q5,q6 + mov r7,r7,ror#20 + veor q14,q9,q10 + add r9,r9,r14 + vshr.u32 q1,q12,#25 + mov r4,r4,ror#20 + vshr.u32 q5,q13,#25 + eor r7,r7,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r4,r4,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r7 + vsli.32 q5,q13,#7 + mov r10,r10,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r4 + vext.8 q2,q2,q2,#8 + mov r14,r14,ror#24 + vext.8 q6,q6,q6,#8 + eor r10,r10,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r14,r14,r3,ror#24 + vext.8 q1,q1,q1,#12 + add r8,r8,r10 + vext.8 q5,q5,q5,#12 + mov r7,r7,ror#25 + vext.8 q9,q9,q9,#12 + add r9,r9,r14 + vext.8 q3,q3,q3,#4 + mov r4,r4,ror#25 + vext.8 q7,q7,q7,#4 + eor r7,r7,r8,ror#25 + vext.8 q11,q11,q11,#4 + eor r4,r4,r9,ror#25 + bne .Loop_neon + + add r11,sp,#32 + vld1.32 {q12,q13},[sp] @ load key material + vld1.32 {q14,q15},[r11] + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 q0,q0,q12 @ accumulate key material + vadd.i32 q4,q4,q12 + vadd.i32 q8,q8,q12 + vldr d24,[sp,#4*(16+0)] @ one + + vadd.i32 q1,q1,q13 + vadd.i32 q5,q5,q13 + vadd.i32 q9,q9,q13 + vldr d26,[sp,#4*(16+2)] @ two + + vadd.i32 q2,q2,q14 + vadd.i32 q6,q6,q14 + vadd.i32 q10,q10,q14 + vadd.i32 d14,d14,d24 @ counter+1 + vadd.i32 d22,d22,d26 @ counter+2 + + vadd.i32 q3,q3,q15 + vadd.i32 q7,q7,q15 + vadd.i32 q11,q11,q15 + + cmp r11,#64*4 + blo .Ltail_neon + + vld1.8 {q12,q13},[r12]! @ load input + mov r11,sp + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 @ xor with input + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + vst1.8 {q0,q1},[r14]! @ store output + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vld1.32 {q0,q1},[r11]! @ load for next iteration + veor d25,d25,d25 + vldr d24,[sp,#4*(16+4)] @ four + veor q9,q9,q13 + vld1.32 {q2,q3},[r11] + veor q10,q10,q14 + vst1.8 {q4,q5},[r14]! + veor q11,q11,q15 + vst1.8 {q6,q7},[r14]! 
+ + vadd.i32 d6,d6,d24 @ next counter value + vldr d24,[sp,#4*(16+0)] @ one + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + vst1.8 {q8,q9},[r14]! + add r1,r1,r9 + ldr r9,[r12,#-12] + vst1.8 {q10,q11},[r14]! + add r2,r2,r10 + ldr r10,[r12,#-8] + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 @ xor with input + add r8,sp,#4*(4) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r5,r5,r9 + ldr r9,[r12,#-12] + add r6,r6,r10 + ldr r10,[r12,#-8] + add r7,r7,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 + add r8,sp,#4*(8) + eor r5,r5,r9 + str r4,[r14],#16 @ store output + eor r6,r6,r10 + str r5,[r14,#-12] + eor r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r1,r1,r9 + ldr r9,[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + add r2,r2,r10 + ldr r10,[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 + add r8,sp,#4*(12) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r8,r8,#4 @ next counter value + add r5,r5,r9 + str r8,[sp,#4*(12)] @ save next counter value + ldr r8,[r12],#16 @ load input + add r6,r6,r10 + add r4,r4,#3 @ counter+3 + ldr r9,[r12,#-12] + add r7,r7,r11 + ldr r10,[r12,#-8] + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 +# ifdef __thumb2__ + it hi +# endif + ldrhi r8,[sp,#4*(32+2)] @ re-load len + eor r5,r5,r9 + eor r6,r6,r10 + str r4,[r14],#16 @ store output + eor r7,r7,r11 + str r5,[r14,#-12] + sub r11,r8,#64*4 @ len-=64*4 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str r11, [sp,#4*(20+32+2)] @ save len + add r11,sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr r12,[sp,#4*(16+10)] + ldr r14,[sp,#4*(16+11)] + vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement + str r12,[sp,#4*(20+16+10)] @ copy "rx" + str r14,[sp,#4*(20+16+11)] @ copy "rx" + + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(20+16+15)] + add r11,sp,#4*(20) + vst1.32 {q0,q1},[r11]! 
@ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {q2,q3},[r11] + mov r11,#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp r11,#64*3 + bhs .L192_or_more_neon + cmp r11,#64*2 + bhs .L128_or_more_neon + cmp r11,#64*1 + bhs .L64_or_more_neon + + add r8,sp,#4*(8) + vst1.8 {q0,q1},[sp] + add r10,sp,#4*(0) + vst1.8 {q2,q3},[r8] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + veor q2,q2,q14 + veor q3,q3,q15 + vst1.8 {q0,q1},[r14]! + vst1.8 {q2,q3},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q4,q5},[sp] + add r10,sp,#4*(0) + vst1.8 {q6,q7},[r8] + sub r11,r11,#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vst1.8 {q0,q1},[r14]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vst1.8 {q4,q5},[r14]! + vst1.8 {q6,q7},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q8,q9},[sp] + add r10,sp,#4*(0) + vst1.8 {q10,q11},[r8] + sub r11,r11,#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q0,q1},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vst1.8 {q2,q3},[r14]! + veor q9,q9,q13 + vst1.8 {q4,q5},[r14]! + veor q10,q10,q14 + vst1.8 {q6,q7},[r14]! + veor q11,q11,q15 + vst1.8 {q8,q9},[r14]! + vst1.8 {q10,q11},[r14]! 
+ + beq .Ldone_neon + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(4) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r6,r6,r10 + add r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} + add r0,sp,#4*(16+8) + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(12) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r4,r4,#3 @ counter+3 + add r6,r6,r10 + add r7,r7,r11 + ldr r11,[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} + add r10,sp,#4*(0) + sub r11,r11,#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb r8,[r10],#1 @ read buffer on stack + ldrb r9,[r12],#1 @ read input + subs r11,r11,#1 + eor r8,r8,r9 + strb r8,[r14],#1 @ store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha-armv8-ios64.S b/ring-0.17.14/pregenerated/chacha-armv8-ios64.S new file mode 100644 index 0000000000..322f67e47d --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-armv8-ios64.S @@ -0,0 +1,1966 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw + +.align 5 +_ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl _ChaCha20_ctr32_neon +.private_extern _ChaCha20_ctr32_neon + +.align 5 +_ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + 
sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext 
v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor 
v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + 
eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor 
w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 
+ add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror 
w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext 
v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add 
v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment 
counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/chacha-armv8-linux64.S b/ring-0.17.14/pregenerated/chacha-armv8-linux64.S new file mode 100644 index 0000000000..00108f0286 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-armv8-linux64.S @@ -0,0 +1,1966 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +.Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +.Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,.Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo .Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +.Ltail: + add x2,x2,#64 +.Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +.Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw + +.globl ChaCha20_ctr32_neon +.hidden ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +.Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +.Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr 
v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror 
w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,.Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo .Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo .Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor 
v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b .Last_neon + +.Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b .Last_neon +.Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +.Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +.Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +.Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add 
v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add 
w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add 
w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +.Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor 
w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + 
eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor 
w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor 
x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs .Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq .Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha-armv8-win64.S b/ring-0.17.14/pregenerated/chacha-armv8-win64.S new file mode 100644 index 0000000000..12d054264e --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-armv8-win64.S @@ -0,0 +1,1972 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl ChaCha20_ctr32_nohw + +.def ChaCha20_ctr32_nohw + .type 32 +.endef +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl ChaCha20_ctr32_neon + +.def ChaCha20_ctr32_neon + .type 32 +.endef +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli 
v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext 
v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor 
v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def ChaCha20_512_neon + .type 32 +.endef +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor 
v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 
+ ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + 
add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror 
w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext 
v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add 
v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // 
store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/chacha-x86-elf.S b/ring-0.17.14/pregenerated/chacha-x86-elf.S new file mode 100644 index 0000000000..b8b6b63c17 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86-elf.S @@ -0,0 +1,601 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl ChaCha20_ctr32_ssse3 +.hidden ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,@function +.align 16 +ChaCha20_ctr32_ssse3: +.L_ChaCha20_ctr32_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .Lpic_point +.Lpic_point: + popl %eax + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + movdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0001x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + movdqu (%edx),%xmm7 + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + paddd 48(%eax),%xmm0 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + psubd 64(%eax),%xmm0 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,64(%ebp) + movdqa %xmm1,80(%ebp) + movdqa %xmm2,96(%ebp) + movdqa %xmm3,112(%ebp) + movdqu 16(%edx),%xmm3 + movdqa %xmm4,-64(%ebp) + movdqa %xmm5,-48(%ebp) + movdqa %xmm6,-32(%ebp) + movdqa %xmm7,-16(%ebp) + movdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,(%ebp) + movdqa %xmm1,16(%ebp) + movdqa %xmm2,32(%ebp) + movdqa %xmm3,48(%ebp) + movdqa %xmm4,-128(%ebp) + movdqa %xmm5,-112(%ebp) + movdqa %xmm6,-96(%ebp) + movdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L001outer_loop +.align 16 +.L001outer_loop: + movdqa -112(%ebp),%xmm1 + movdqa -96(%ebp),%xmm2 + movdqa -80(%ebp),%xmm3 + movdqa -48(%ebp),%xmm5 + movdqa -32(%ebp),%xmm6 + movdqa -16(%ebp),%xmm7 + movdqa %xmm1,-112(%ebx) + movdqa %xmm2,-96(%ebx) + movdqa %xmm3,-80(%ebx) + movdqa %xmm5,-48(%ebx) + movdqa %xmm6,-32(%ebx) + movdqa %xmm7,-16(%ebx) + movdqa 32(%ebp),%xmm2 + movdqa 48(%ebp),%xmm3 + movdqa 64(%ebp),%xmm4 + movdqa 80(%ebp),%xmm5 + movdqa 96(%ebp),%xmm6 + movdqa 112(%ebp),%xmm7 + paddd 64(%eax),%xmm4 + movdqa %xmm2,32(%ebx) + movdqa %xmm3,48(%ebx) + movdqa %xmm4,64(%ebx) + movdqa %xmm5,80(%ebx) + movdqa %xmm6,96(%ebx) + movdqa %xmm7,112(%ebx) + movdqa %xmm4,64(%ebp) + movdqa -128(%ebp),%xmm0 + movdqa %xmm4,%xmm6 + movdqa -64(%ebp),%xmm3 + movdqa (%ebp),%xmm4 + movdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 16 +.L002loop: + paddd %xmm3,%xmm0 + movdqa %xmm3,%xmm2 + pxor %xmm0,%xmm6 + pshufb (%eax),%xmm6 + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -48(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 80(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,64(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-64(%ebx) + paddd %xmm7,%xmm5 + movdqa 32(%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -32(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 96(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,80(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + 
pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,16(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-48(%ebx) + paddd %xmm6,%xmm4 + movdqa 48(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -16(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 112(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,96(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-32(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa -48(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,%xmm6 + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + pshufb (%eax),%xmm6 + movdqa %xmm3,-16(%ebx) + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -32(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 64(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,112(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,32(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-48(%ebx) + paddd %xmm7,%xmm5 + movdqa (%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -16(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 80(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,64(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,48(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-32(%ebx) + paddd %xmm6,%xmm4 + movdqa 16(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -64(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 96(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,80(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-16(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 64(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,96(%ebx) + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + por %xmm1,%xmm3 + decl %edx + jnz .L002loop + movdqa %xmm3,-64(%ebx) + movdqa %xmm4,(%ebx) + movdqa %xmm5,16(%ebx) + movdqa %xmm6,64(%ebx) + movdqa %xmm7,96(%ebx) + movdqa -112(%ebx),%xmm1 + movdqa -96(%ebx),%xmm2 + movdqa -80(%ebx),%xmm3 + paddd -128(%ebp),%xmm0 + paddd -112(%ebp),%xmm1 + paddd -96(%ebp),%xmm2 + paddd -80(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq 
%xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa -64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa -48(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa -32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa -16(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd -64(%ebp),%xmm0 + paddd -48(%ebp),%xmm1 + paddd -32(%ebp),%xmm2 + paddd -16(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa (%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 16(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 48(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd (%ebp),%xmm0 + paddd 16(%ebp),%xmm1 + paddd 32(%ebp),%xmm2 + paddd 48(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa 64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 80(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 96(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 112(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd 64(%ebp),%xmm0 + paddd 80(%ebp),%xmm1 + paddd 96(%ebp),%xmm2 + paddd 112(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 208(%esi),%esi + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm5 + pxor %xmm2,%xmm6 + pxor %xmm3,%xmm7 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L001outer_loop + addl $256,%ecx + jz .L003done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + movd 64(%ebp),%xmm2 + movdqu (%ebx),%xmm3 + paddd 96(%eax),%xmm2 + pand 112(%eax),%xmm3 + por %xmm2,%xmm3 +.L0001x: + movdqa 32(%eax),%xmm0 + movdqu (%edx),%xmm1 + movdqu 16(%edx),%xmm2 + movdqa (%eax),%xmm6 + movdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L004loop1x +.align 16 +.L005outer1x: + movdqa 80(%eax),%xmm3 + movdqa (%esp),%xmm0 + movdqa 16(%esp),%xmm1 + movdqa 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + movl $10,%edx + movdqa 
%xmm3,48(%esp) + jmp .L004loop1x +.align 16 +.L004loop1x: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L004loop1x + paddd (%esp),%xmm0 + paddd 16(%esp),%xmm1 + paddd 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + cmpl $64,%ecx + jb .L006tail + movdqu (%esi),%xmm4 + movdqu 16(%esi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%esi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%esi),%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + leal 64(%esi),%esi + movdqu %xmm0,(%edi) + movdqu %xmm1,16(%edi) + movdqu %xmm2,32(%edi) + movdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L005outer1x + jmp .L003done +.L006tail: + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L007tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L007tail_loop +.L003done: + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_ctr32_ssse3,.-.L_ChaCha20_ctr32_ssse3_begin +.align 64 +.Lssse3_data: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.long 1634760805,857760878,2036477234,1797285236 +.long 0,1,2,3 +.long 4,4,4,4 +.long 1,0,0,0 +.long 4,0,0,0 +.long 0,-1,-1,-1 +.align 64 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +.byte 114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha-x86-win32n.asm b/ring-0.17.14/pregenerated/chacha-x86-win32n.asm new file mode 100644 index 0000000000..36e7b34234 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86-win32n.asm @@ -0,0 +1,607 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _ChaCha20_ctr32_ssse3 +align 16 +_ChaCha20_ctr32_ssse3: +L$_ChaCha20_ctr32_ssse3_begin: + push ebp + push ebx + push esi + push edi + call L$pic_point +L$pic_point: + pop eax + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov ecx,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub esp,524 + and esp,-64 + mov DWORD [512+esp],ebp + lea eax,[(L$ssse3_data-L$pic_point)+eax] + movdqu xmm3,[ebx] + cmp ecx,256 + jb NEAR L$0001x + mov DWORD [516+esp],edx + mov DWORD [520+esp],ebx + sub ecx,256 + lea ebp,[384+esp] + movdqu xmm7,[edx] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + paddd xmm0,[48+eax] + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + psubd xmm0,[64+eax] + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [64+ebp],xmm0 + movdqa [80+ebp],xmm1 + movdqa [96+ebp],xmm2 + movdqa [112+ebp],xmm3 + movdqu xmm3,[16+edx] + movdqa [ebp-64],xmm4 + movdqa [ebp-48],xmm5 + movdqa [ebp-32],xmm6 + movdqa [ebp-16],xmm7 + movdqa xmm7,[32+eax] + lea ebx,[128+esp] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [ebp],xmm0 + movdqa [16+ebp],xmm1 + movdqa [32+ebp],xmm2 + movdqa [48+ebp],xmm3 + movdqa [ebp-128],xmm4 + movdqa [ebp-112],xmm5 + movdqa [ebp-96],xmm6 + movdqa [ebp-80],xmm7 + lea esi,[128+esi] + lea edi,[128+edi] + jmp NEAR L$001outer_loop +align 16 +L$001outer_loop: + movdqa xmm1,[ebp-112] + movdqa xmm2,[ebp-96] + movdqa xmm3,[ebp-80] + movdqa xmm5,[ebp-48] + movdqa xmm6,[ebp-32] + movdqa xmm7,[ebp-16] + movdqa [ebx-112],xmm1 + movdqa [ebx-96],xmm2 + movdqa [ebx-80],xmm3 + movdqa [ebx-48],xmm5 + movdqa [ebx-32],xmm6 + movdqa [ebx-16],xmm7 + movdqa xmm2,[32+ebp] + movdqa xmm3,[48+ebp] + movdqa xmm4,[64+ebp] + movdqa xmm5,[80+ebp] + movdqa xmm6,[96+ebp] + movdqa xmm7,[112+ebp] + paddd xmm4,[64+eax] + movdqa [32+ebx],xmm2 + movdqa [48+ebx],xmm3 + movdqa [64+ebx],xmm4 + movdqa [80+ebx],xmm5 + movdqa [96+ebx],xmm6 + movdqa [112+ebx],xmm7 + movdqa [64+ebp],xmm4 + movdqa xmm0,[ebp-128] + movdqa xmm6,xmm4 + movdqa xmm3,[ebp-64] + movdqa xmm4,[ebp] + movdqa xmm5,[16+ebp] + mov edx,10 + nop +align 16 +L$002loop: + paddd xmm0,xmm3 + movdqa xmm2,xmm3 + pxor xmm6,xmm0 + pshufb xmm6,[eax] + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-48] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[80+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [64+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-64],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[32+ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-32] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[96+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [80+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [16+ebx],xmm5 + pshufb 
xmm6,[eax] + movdqa [ebx-48],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[48+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-16] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[112+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [96+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-32],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm2,[ebx-48] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa xmm6,xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + pshufb xmm6,[eax] + movdqa [ebx-16],xmm3 + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-32] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[64+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [112+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [32+ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-48],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-16] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[80+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [64+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [48+ebx],xmm5 + pshufb xmm6,[eax] + movdqa [ebx-32],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[16+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-64] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[96+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [80+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-16],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + movdqa xmm6,[64+ebx] + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [96+ebx],xmm7 + pxor xmm3,xmm5 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + por xmm3,xmm1 + dec edx + jnz NEAR L$002loop + movdqa [ebx-64],xmm3 + movdqa [ebx],xmm4 + movdqa [16+ebx],xmm5 + movdqa [64+ebx],xmm6 + movdqa [96+ebx],xmm7 + movdqa xmm1,[ebx-112] + movdqa xmm2,[ebx-96] + movdqa xmm3,[ebx-80] + paddd xmm0,[ebp-128] + paddd xmm1,[ebp-112] + paddd xmm2,[ebp-96] + paddd xmm3,[ebp-80] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx-64] + pxor xmm5,xmm1 + movdqa xmm1,[ebx-48] + pxor xmm6,xmm2 + movdqa xmm2,[ebx-32] + pxor xmm7,xmm3 
+ movdqa xmm3,[ebx-16] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp-64] + paddd xmm1,[ebp-48] + paddd xmm2,[ebp-32] + paddd xmm3,[ebp-16] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx] + pxor xmm5,xmm1 + movdqa xmm1,[16+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[32+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[48+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp] + paddd xmm1,[16+ebp] + paddd xmm2,[32+ebp] + paddd xmm3,[48+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[64+ebx] + pxor xmm5,xmm1 + movdqa xmm1,[80+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[96+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[112+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[64+ebp] + paddd xmm1,[80+ebp] + paddd xmm2,[96+ebp] + paddd xmm3,[112+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[208+esi] + pxor xmm4,xmm0 + pxor xmm5,xmm1 + pxor xmm6,xmm2 + pxor xmm7,xmm3 + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[208+edi] + sub ecx,256 + jnc NEAR L$001outer_loop + add ecx,256 + jz NEAR L$003done + mov ebx,DWORD [520+esp] + lea esi,[esi-128] + mov edx,DWORD [516+esp] + lea edi,[edi-128] + movd xmm2,DWORD [64+ebp] + movdqu xmm3,[ebx] + paddd xmm2,[96+eax] + pand xmm3,[112+eax] + por xmm3,xmm2 +L$0001x: + movdqa xmm0,[32+eax] + movdqu xmm1,[edx] + movdqu xmm2,[16+edx] + movdqa xmm6,[eax] + movdqa xmm7,[16+eax] + mov DWORD [48+esp],ebp + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + mov edx,10 + jmp NEAR L$004loop1x +align 16 +L$005outer1x: + movdqa xmm3,[80+eax] + movdqa xmm0,[esp] + movdqa xmm1,[16+esp] + movdqa xmm2,[32+esp] + paddd xmm3,[48+esp] + mov edx,10 + movdqa [48+esp],xmm3 + jmp NEAR L$004loop1x +align 16 +L$004loop1x: + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 
102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec edx + jnz NEAR L$004loop1x + paddd xmm0,[esp] + paddd xmm1,[16+esp] + paddd xmm2,[32+esp] + paddd xmm3,[48+esp] + cmp ecx,64 + jb NEAR L$006tail + movdqu xmm4,[esi] + movdqu xmm5,[16+esi] + pxor xmm0,xmm4 + movdqu xmm4,[32+esi] + pxor xmm1,xmm5 + movdqu xmm5,[48+esi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + lea esi,[64+esi] + movdqu [edi],xmm0 + movdqu [16+edi],xmm1 + movdqu [32+edi],xmm2 + movdqu [48+edi],xmm3 + lea edi,[64+edi] + sub ecx,64 + jnz NEAR L$005outer1x + jmp NEAR L$003done +L$006tail: + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + xor eax,eax + xor edx,edx + xor ebp,ebp +L$007tail_loop: + mov al,BYTE [ebp*1+esp] + mov dl,BYTE [ebp*1+esi] + lea ebp,[1+ebp] + xor al,dl + mov BYTE [ebp*1+edi-1],al + dec ecx + jnz NEAR L$007tail_loop +L$003done: + mov esp,DWORD [512+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$ssse3_data: +db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +dd 1634760805,857760878,2036477234,1797285236 +dd 0,1,2,3 +dd 4,4,4,4 +dd 1,0,0,0 +dd 4,0,0,0 +dd 0,-1,-1,-1 +align 64 +db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +db 114,103,62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/chacha-x86-win32n.o b/ring-0.17.14/pregenerated/chacha-x86-win32n.o new file mode 100644 index 0000000000..928f2e452d Binary files /dev/null and b/ring-0.17.14/pregenerated/chacha-x86-win32n.o differ diff --git a/ring-0.17.14/pregenerated/chacha-x86_64-elf.S b/ring-0.17.14/pregenerated/chacha-x86_64-elf.S new file mode 100644 index 0000000000..a85ae2b2bb --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86_64-elf.S @@ -0,0 +1,1474 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.section .rodata +.align 64 +.Lzero: +.long 0,0,0,0 +.Lone: +.long 1,0,0,0 +.Linc: +.long 0,1,2,3 +.Lfour: +.long 4,4,4,4 +.Lincy: +.long 0,2,4,6,1,3,5,7 +.Leight: +.long 8,8,8,8,8,8,8,8 +.Lrot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +.Lrot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Lsigma: +.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,@function +.align 64 +ChaCha20_ctr32_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-56 + subq $64+24,%rsp +.cfi_adjust_cfa_offset 88 +.Lctr32_body: + + + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa .Lone(%rip),%xmm4 + + + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq %rdx,%rbp + jmp .Loop_outer + +.align 32 +.Loop_outer: + movl $0x61707865,%eax + movl $0x3320646e,%ebx + movl $0x79622d32,%ecx + movl $0x6b206574,%edx + movl 16(%rsp),%r8d + movl 20(%rsp),%r9d + movl 24(%rsp),%r10d + movl 28(%rsp),%r11d + movd %xmm3,%r12d + movl 52(%rsp),%r13d + movl 56(%rsp),%r14d + movl 60(%rsp),%r15d + + movq %rbp,64+0(%rsp) + movl $10,%ebp + movq %rsi,64+8(%rsp) +.byte 102,72,15,126,214 + movq %rdi,64+16(%rsp) + movq %rsi,%rdi + shrq $32,%rdi + jmp .Loop + +.align 32 +.Loop: + addl %r8d,%eax + xorl %eax,%r12d + roll $16,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $16,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $12,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $12,%r9d + addl %r8d,%eax + xorl %eax,%r12d + roll $8,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $8,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $7,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $7,%r9d + movl %esi,32(%rsp) + movl %edi,36(%rsp) + movl 40(%rsp),%esi + movl 44(%rsp),%edi + addl %r10d,%ecx + xorl %ecx,%r14d + roll $16,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $16,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $12,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $12,%r11d + addl %r10d,%ecx + xorl %ecx,%r14d + roll $8,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $8,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $7,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $7,%r11d + addl %r9d,%eax + xorl %eax,%r15d + roll $16,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $16,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $12,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $12,%r10d + addl %r9d,%eax + xorl %eax,%r15d + roll $8,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $8,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $7,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $7,%r10d + 
movl %esi,40(%rsp) + movl %edi,44(%rsp) + movl 32(%rsp),%esi + movl 36(%rsp),%edi + addl %r11d,%ecx + xorl %ecx,%r13d + roll $16,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $16,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $12,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $12,%r8d + addl %r11d,%ecx + xorl %ecx,%r13d + roll $8,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $8,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $7,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $7,%r8d + decl %ebp + jnz .Loop + movl %edi,36(%rsp) + movl %esi,32(%rsp) + movq 64(%rsp),%rbp + movdqa %xmm2,%xmm1 + movq 64+8(%rsp),%rsi + paddd %xmm4,%xmm3 + movq 64+16(%rsp),%rdi + + addl $0x61707865,%eax + addl $0x3320646e,%ebx + addl $0x79622d32,%ecx + addl $0x6b206574,%edx + addl 16(%rsp),%r8d + addl 20(%rsp),%r9d + addl 24(%rsp),%r10d + addl 28(%rsp),%r11d + addl 48(%rsp),%r12d + addl 52(%rsp),%r13d + addl 56(%rsp),%r14d + addl 60(%rsp),%r15d + paddd 32(%rsp),%xmm1 + + cmpq $64,%rbp + jb .Ltail + + xorl 0(%rsi),%eax + xorl 4(%rsi),%ebx + xorl 8(%rsi),%ecx + xorl 12(%rsi),%edx + xorl 16(%rsi),%r8d + xorl 20(%rsi),%r9d + xorl 24(%rsi),%r10d + xorl 28(%rsi),%r11d + movdqu 32(%rsi),%xmm0 + xorl 48(%rsi),%r12d + xorl 52(%rsi),%r13d + xorl 56(%rsi),%r14d + xorl 60(%rsi),%r15d + leaq 64(%rsi),%rsi + pxor %xmm1,%xmm0 + + movdqa %xmm2,32(%rsp) + movd %xmm3,48(%rsp) + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + movdqu %xmm0,32(%rdi) + movl %r12d,48(%rdi) + movl %r13d,52(%rdi) + movl %r14d,56(%rdi) + movl %r15d,60(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rbp + jnz .Loop_outer + + jmp .Ldone + +.align 16 +.Ltail: + movl %eax,0(%rsp) + movl %ebx,4(%rsp) + xorq %rbx,%rbx + movl %ecx,8(%rsp) + movl %edx,12(%rsp) + movl %r8d,16(%rsp) + movl %r9d,20(%rsp) + movl %r10d,24(%rsp) + movl %r11d,28(%rsp) + movdqa %xmm1,32(%rsp) + movl %r12d,48(%rsp) + movl %r13d,52(%rsp) + movl %r14d,56(%rsp) + movl %r15d,60(%rsp) + +.Loop_tail: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%edx + leaq 1(%rbx),%rbx + xorl %edx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rbp + jnz .Loop_tail + +.Ldone: + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 +.cfi_restore r15 + movq -40(%rsi),%r14 +.cfi_restore r14 + movq -32(%rsi),%r13 +.cfi_restore r13 + movq -24(%rsi),%r12 +.cfi_restore r12 + movq -16(%rsi),%rbp +.cfi_restore rbp + movq -8(%rsi),%rbx +.cfi_restore rbx + leaq (%rsi),%rsp +.cfi_adjust_cfa_offset -136 +.Lno_data: + ret +.cfi_endproc +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +.globl ChaCha20_ctr32_ssse3_4x +.hidden ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,@function +.align 32 +ChaCha20_ctr32_ssse3_4x: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r9 +.cfi_def_cfa_register r9 + subq $0x140+8,%rsp + movdqa .Lsigma(%rip),%xmm11 + movdqu (%rcx),%xmm15 + movdqu 16(%rcx),%xmm7 + movdqu (%r8),%xmm3 + leaq 256(%rsp),%rcx + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + pshufd $0x00,%xmm11,%xmm8 + pshufd $0x55,%xmm11,%xmm9 + movdqa %xmm8,64(%rsp) + pshufd $0xaa,%xmm11,%xmm10 + movdqa %xmm9,80(%rsp) + pshufd $0xff,%xmm11,%xmm11 + movdqa %xmm10,96(%rsp) + movdqa %xmm11,112(%rsp) + + pshufd $0x00,%xmm15,%xmm12 + pshufd $0x55,%xmm15,%xmm13 + movdqa %xmm12,128-256(%rcx) + pshufd $0xaa,%xmm15,%xmm14 + movdqa %xmm13,144-256(%rcx) + pshufd $0xff,%xmm15,%xmm15 + movdqa %xmm14,160-256(%rcx) + movdqa %xmm15,176-256(%rcx) + + pshufd $0x00,%xmm7,%xmm4 + pshufd $0x55,%xmm7,%xmm5 + movdqa %xmm4,192-256(%rcx) + 
pshufd $0xaa,%xmm7,%xmm6 + movdqa %xmm5,208-256(%rcx) + pshufd $0xff,%xmm7,%xmm7 + movdqa %xmm6,224-256(%rcx) + movdqa %xmm7,240-256(%rcx) + + pshufd $0x00,%xmm3,%xmm0 + pshufd $0x55,%xmm3,%xmm1 + paddd .Linc(%rip),%xmm0 + pshufd $0xaa,%xmm3,%xmm2 + movdqa %xmm1,272-256(%rcx) + pshufd $0xff,%xmm3,%xmm3 + movdqa %xmm2,288-256(%rcx) + movdqa %xmm3,304-256(%rcx) + + jmp .Loop_enter4x + +.align 32 +.Loop_outer4x: + movdqa 64(%rsp),%xmm8 + movdqa 80(%rsp),%xmm9 + movdqa 96(%rsp),%xmm10 + movdqa 112(%rsp),%xmm11 + movdqa 128-256(%rcx),%xmm12 + movdqa 144-256(%rcx),%xmm13 + movdqa 160-256(%rcx),%xmm14 + movdqa 176-256(%rcx),%xmm15 + movdqa 192-256(%rcx),%xmm4 + movdqa 208-256(%rcx),%xmm5 + movdqa 224-256(%rcx),%xmm6 + movdqa 240-256(%rcx),%xmm7 + movdqa 256-256(%rcx),%xmm0 + movdqa 272-256(%rcx),%xmm1 + movdqa 288-256(%rcx),%xmm2 + movdqa 304-256(%rcx),%xmm3 + paddd .Lfour(%rip),%xmm0 + +.Loop_enter4x: + movdqa %xmm6,32(%rsp) + movdqa %xmm7,48(%rsp) + movdqa (%r10),%xmm7 + movl $10,%eax + movdqa %xmm0,256-256(%rcx) + jmp .Loop4x + +.align 32 +.Loop4x: + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,199 +.byte 102,15,56,0,207 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm6 + pslld $12,%xmm12 + psrld $20,%xmm6 + movdqa %xmm13,%xmm7 + pslld $12,%xmm13 + por %xmm6,%xmm12 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm13 + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm7 + pslld $7,%xmm12 + psrld $25,%xmm7 + movdqa %xmm13,%xmm6 + pslld $7,%xmm13 + por %xmm7,%xmm12 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm13 + movdqa %xmm4,0(%rsp) + movdqa %xmm5,16(%rsp) + movdqa 32(%rsp),%xmm4 + movdqa 48(%rsp),%xmm5 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,215 +.byte 102,15,56,0,223 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm6 + pslld $12,%xmm14 + psrld $20,%xmm6 + movdqa %xmm15,%xmm7 + pslld $12,%xmm15 + por %xmm6,%xmm14 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm15 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,214 +.byte 102,15,56,0,222 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm7 + pslld $7,%xmm14 + psrld $25,%xmm7 + movdqa %xmm15,%xmm6 + pslld $7,%xmm15 + por %xmm7,%xmm14 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm15 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,223 +.byte 102,15,56,0,199 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm6 + pslld $12,%xmm13 + psrld $20,%xmm6 + movdqa %xmm14,%xmm7 + pslld $12,%xmm14 + por %xmm6,%xmm13 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm14 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,222 +.byte 102,15,56,0,198 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm7 + pslld $7,%xmm13 + psrld $25,%xmm7 + movdqa %xmm14,%xmm6 + pslld $7,%xmm14 + por %xmm7,%xmm13 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm14 + movdqa %xmm4,32(%rsp) + movdqa %xmm5,48(%rsp) + movdqa 0(%rsp),%xmm4 + movdqa 
16(%rsp),%xmm5 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm6 + pslld $12,%xmm15 + psrld $20,%xmm6 + movdqa %xmm12,%xmm7 + pslld $12,%xmm12 + por %xmm6,%xmm15 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm12 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm7 + pslld $7,%xmm15 + psrld $25,%xmm7 + movdqa %xmm12,%xmm6 + pslld $7,%xmm12 + por %xmm7,%xmm15 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm12 + decl %eax + jnz .Loop4x + + paddd 64(%rsp),%xmm8 + paddd 80(%rsp),%xmm9 + paddd 96(%rsp),%xmm10 + paddd 112(%rsp),%xmm11 + + movdqa %xmm8,%xmm6 + punpckldq %xmm9,%xmm8 + movdqa %xmm10,%xmm7 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm9,%xmm6 + punpckhdq %xmm11,%xmm7 + movdqa %xmm8,%xmm9 + punpcklqdq %xmm10,%xmm8 + movdqa %xmm6,%xmm11 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm10,%xmm9 + punpckhqdq %xmm7,%xmm11 + paddd 128-256(%rcx),%xmm12 + paddd 144-256(%rcx),%xmm13 + paddd 160-256(%rcx),%xmm14 + paddd 176-256(%rcx),%xmm15 + + movdqa %xmm8,0(%rsp) + movdqa %xmm9,16(%rsp) + movdqa 32(%rsp),%xmm8 + movdqa 48(%rsp),%xmm9 + + movdqa %xmm12,%xmm10 + punpckldq %xmm13,%xmm12 + movdqa %xmm14,%xmm7 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm13,%xmm10 + punpckhdq %xmm15,%xmm7 + movdqa %xmm12,%xmm13 + punpcklqdq %xmm14,%xmm12 + movdqa %xmm10,%xmm15 + punpcklqdq %xmm7,%xmm10 + punpckhqdq %xmm14,%xmm13 + punpckhqdq %xmm7,%xmm15 + paddd 192-256(%rcx),%xmm4 + paddd 208-256(%rcx),%xmm5 + paddd 224-256(%rcx),%xmm8 + paddd 240-256(%rcx),%xmm9 + + movdqa %xmm6,32(%rsp) + movdqa %xmm11,48(%rsp) + + movdqa %xmm4,%xmm14 + punpckldq %xmm5,%xmm4 + movdqa %xmm8,%xmm7 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm5,%xmm14 + punpckhdq %xmm9,%xmm7 + movdqa %xmm4,%xmm5 + punpcklqdq %xmm8,%xmm4 + movdqa %xmm14,%xmm9 + punpcklqdq %xmm7,%xmm14 + punpckhqdq %xmm8,%xmm5 + punpckhqdq %xmm7,%xmm9 + paddd 256-256(%rcx),%xmm0 + paddd 272-256(%rcx),%xmm1 + paddd 288-256(%rcx),%xmm2 + paddd 304-256(%rcx),%xmm3 + + movdqa %xmm0,%xmm8 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm8 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm8,%xmm3 + punpcklqdq %xmm7,%xmm8 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + cmpq $256,%rdx + jb .Ltail4x + + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 
112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 48(%rsp),%xmm6 + pxor %xmm15,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm3,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz .Loop_outer4x + + jmp .Ldone4x + +.Ltail4x: + cmpq $192,%rdx + jae .L192_or_more4x + cmpq $128,%rdx + jae .L128_or_more4x + cmpq $64,%rdx + jae .L64_or_more4x + + + xorq %r10,%r10 + + movdqa %xmm12,16(%rsp) + movdqa %xmm4,32(%rsp) + movdqa %xmm0,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L64_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je .Ldone4x + + movdqa 16(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm13,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm5,32(%rsp) + subq $64,%rdx + movdqa %xmm1,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L128_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + je .Ldone4x + + movdqa 32(%rsp),%xmm6 + leaq 128(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm10,16(%rsp) + leaq 128(%rdi),%rdi + movdqa %xmm14,32(%rsp) + subq $128,%rdx + movdqa %xmm8,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L192_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je .Ldone4x + + movdqa 48(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm15,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm9,32(%rsp) + subq $192,%rdx + movdqa %xmm3,48(%rsp) + +.Loop_tail4x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail4x + +.Ldone4x: + leaq (%r9),%rsp +.cfi_def_cfa_register rsp +.L4x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x +.globl ChaCha20_ctr32_avx2 +.hidden ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,@function +.align 32 +ChaCha20_ctr32_avx2: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r9 +.cfi_def_cfa_register r9 + subq $0x280+8,%rsp + andq $-32,%rsp 
+ vzeroupper + + + + + + + + + + + vbroadcasti128 .Lsigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd .Lincy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd .Leight(%rip),%ymm4,%ymm4 + +.Loop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + 
vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz .Loop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + 
vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb .Ltail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmpq $448,%rdx + jae .L448_or_more8x + cmpq $384,%rdx + jae .L384_or_more8x + cmpq $320,%rdx + jae .L320_or_more8x + cmpq $256,%rdx + jae .L256_or_more8x + cmpq $192,%rdx + jae .L192_or_more8x + cmpq $128,%rdx + jae .L128_or_more8x + cmpq $64,%rdx + jae .L64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je .Ldone8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je .Ldone8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 
64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je .Ldone8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je .Ldone8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je .Ldone8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je .Ldone8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je .Ldone8x + + leaq 
448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +.Loop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail8x + +.Ldone8x: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register rsp +.L8x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 +#endif diff --git a/ring-0.17.14/pregenerated/chacha-x86_64-macosx.S b/ring-0.17.14/pregenerated/chacha-x86_64-macosx.S new file mode 100644 index 0000000000..60cc8717d1 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86_64-macosx.S @@ -0,0 +1,1468 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.section __DATA,__const +.p2align 6 +L$zero: +.long 0,0,0,0 +L$one: +.long 1,0,0,0 +L$inc: +.long 0,1,2,3 +L$four: +.long 4,4,4,4 +L$incy: +.long 0,2,4,6,1,3,5,7 +L$eight: +.long 8,8,8,8,8,8,8,8 +L$rot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +L$rot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +L$sigma: +.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.p2align 6 +L$zeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +L$fourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +L$incz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +L$sixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw + +.p2align 6 +_ChaCha20_ctr32_nohw: + +_CET_ENDBR + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $64+24,%rsp + +L$ctr32_body: + + + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa L$one(%rip),%xmm4 + + + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq %rdx,%rbp + jmp L$oop_outer + +.p2align 5 +L$oop_outer: + movl $0x61707865,%eax + movl $0x3320646e,%ebx + movl $0x79622d32,%ecx + movl $0x6b206574,%edx + movl 16(%rsp),%r8d + movl 20(%rsp),%r9d + movl 24(%rsp),%r10d + movl 28(%rsp),%r11d + movd %xmm3,%r12d + movl 52(%rsp),%r13d + movl 56(%rsp),%r14d + movl 60(%rsp),%r15d + + movq %rbp,64+0(%rsp) + movl $10,%ebp + movq %rsi,64+8(%rsp) +.byte 102,72,15,126,214 + movq %rdi,64+16(%rsp) + movq %rsi,%rdi + shrq $32,%rdi + jmp L$oop + +.p2align 5 +L$oop: + addl %r8d,%eax + xorl %eax,%r12d + roll $16,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $16,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $12,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $12,%r9d + addl %r8d,%eax + xorl %eax,%r12d + roll $8,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $8,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $7,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $7,%r9d + movl %esi,32(%rsp) + movl %edi,36(%rsp) + movl 40(%rsp),%esi + movl 44(%rsp),%edi + addl %r10d,%ecx + xorl %ecx,%r14d + roll $16,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $16,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $12,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $12,%r11d + addl %r10d,%ecx + xorl %ecx,%r14d + roll $8,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $8,%r15d + addl %r14d,%esi + 
xorl %esi,%r10d + roll $7,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $7,%r11d + addl %r9d,%eax + xorl %eax,%r15d + roll $16,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $16,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $12,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $12,%r10d + addl %r9d,%eax + xorl %eax,%r15d + roll $8,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $8,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $7,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $7,%r10d + movl %esi,40(%rsp) + movl %edi,44(%rsp) + movl 32(%rsp),%esi + movl 36(%rsp),%edi + addl %r11d,%ecx + xorl %ecx,%r13d + roll $16,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $16,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $12,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $12,%r8d + addl %r11d,%ecx + xorl %ecx,%r13d + roll $8,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $8,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $7,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $7,%r8d + decl %ebp + jnz L$oop + movl %edi,36(%rsp) + movl %esi,32(%rsp) + movq 64(%rsp),%rbp + movdqa %xmm2,%xmm1 + movq 64+8(%rsp),%rsi + paddd %xmm4,%xmm3 + movq 64+16(%rsp),%rdi + + addl $0x61707865,%eax + addl $0x3320646e,%ebx + addl $0x79622d32,%ecx + addl $0x6b206574,%edx + addl 16(%rsp),%r8d + addl 20(%rsp),%r9d + addl 24(%rsp),%r10d + addl 28(%rsp),%r11d + addl 48(%rsp),%r12d + addl 52(%rsp),%r13d + addl 56(%rsp),%r14d + addl 60(%rsp),%r15d + paddd 32(%rsp),%xmm1 + + cmpq $64,%rbp + jb L$tail + + xorl 0(%rsi),%eax + xorl 4(%rsi),%ebx + xorl 8(%rsi),%ecx + xorl 12(%rsi),%edx + xorl 16(%rsi),%r8d + xorl 20(%rsi),%r9d + xorl 24(%rsi),%r10d + xorl 28(%rsi),%r11d + movdqu 32(%rsi),%xmm0 + xorl 48(%rsi),%r12d + xorl 52(%rsi),%r13d + xorl 56(%rsi),%r14d + xorl 60(%rsi),%r15d + leaq 64(%rsi),%rsi + pxor %xmm1,%xmm0 + + movdqa %xmm2,32(%rsp) + movd %xmm3,48(%rsp) + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + movdqu %xmm0,32(%rdi) + movl %r12d,48(%rdi) + movl %r13d,52(%rdi) + movl %r14d,56(%rdi) + movl %r15d,60(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rbp + jnz L$oop_outer + + jmp L$done + +.p2align 4 +L$tail: + movl %eax,0(%rsp) + movl %ebx,4(%rsp) + xorq %rbx,%rbx + movl %ecx,8(%rsp) + movl %edx,12(%rsp) + movl %r8d,16(%rsp) + movl %r9d,20(%rsp) + movl %r10d,24(%rsp) + movl %r11d,28(%rsp) + movdqa %xmm1,32(%rsp) + movl %r12d,48(%rsp) + movl %r13d,52(%rsp) + movl %r14d,56(%rsp) + movl %r15d,60(%rsp) + +L$oop_tail: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%edx + leaq 1(%rbx),%rbx + xorl %edx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rbp + jnz L$oop_tail + +L$done: + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$no_data: + ret + + +.globl _ChaCha20_ctr32_ssse3_4x +.private_extern _ChaCha20_ctr32_ssse3_4x + +.p2align 5 +_ChaCha20_ctr32_ssse3_4x: + +_CET_ENDBR + movq %rsp,%r9 + + subq $0x140+8,%rsp + movdqa L$sigma(%rip),%xmm11 + movdqu (%rcx),%xmm15 + movdqu 16(%rcx),%xmm7 + movdqu (%r8),%xmm3 + leaq 256(%rsp),%rcx + leaq L$rot16(%rip),%r10 + leaq L$rot24(%rip),%r11 + + pshufd $0x00,%xmm11,%xmm8 + pshufd $0x55,%xmm11,%xmm9 + movdqa %xmm8,64(%rsp) + pshufd $0xaa,%xmm11,%xmm10 + movdqa %xmm9,80(%rsp) + pshufd $0xff,%xmm11,%xmm11 + movdqa %xmm10,96(%rsp) + movdqa %xmm11,112(%rsp) + + pshufd $0x00,%xmm15,%xmm12 + pshufd $0x55,%xmm15,%xmm13 + movdqa 
%xmm12,128-256(%rcx) + pshufd $0xaa,%xmm15,%xmm14 + movdqa %xmm13,144-256(%rcx) + pshufd $0xff,%xmm15,%xmm15 + movdqa %xmm14,160-256(%rcx) + movdqa %xmm15,176-256(%rcx) + + pshufd $0x00,%xmm7,%xmm4 + pshufd $0x55,%xmm7,%xmm5 + movdqa %xmm4,192-256(%rcx) + pshufd $0xaa,%xmm7,%xmm6 + movdqa %xmm5,208-256(%rcx) + pshufd $0xff,%xmm7,%xmm7 + movdqa %xmm6,224-256(%rcx) + movdqa %xmm7,240-256(%rcx) + + pshufd $0x00,%xmm3,%xmm0 + pshufd $0x55,%xmm3,%xmm1 + paddd L$inc(%rip),%xmm0 + pshufd $0xaa,%xmm3,%xmm2 + movdqa %xmm1,272-256(%rcx) + pshufd $0xff,%xmm3,%xmm3 + movdqa %xmm2,288-256(%rcx) + movdqa %xmm3,304-256(%rcx) + + jmp L$oop_enter4x + +.p2align 5 +L$oop_outer4x: + movdqa 64(%rsp),%xmm8 + movdqa 80(%rsp),%xmm9 + movdqa 96(%rsp),%xmm10 + movdqa 112(%rsp),%xmm11 + movdqa 128-256(%rcx),%xmm12 + movdqa 144-256(%rcx),%xmm13 + movdqa 160-256(%rcx),%xmm14 + movdqa 176-256(%rcx),%xmm15 + movdqa 192-256(%rcx),%xmm4 + movdqa 208-256(%rcx),%xmm5 + movdqa 224-256(%rcx),%xmm6 + movdqa 240-256(%rcx),%xmm7 + movdqa 256-256(%rcx),%xmm0 + movdqa 272-256(%rcx),%xmm1 + movdqa 288-256(%rcx),%xmm2 + movdqa 304-256(%rcx),%xmm3 + paddd L$four(%rip),%xmm0 + +L$oop_enter4x: + movdqa %xmm6,32(%rsp) + movdqa %xmm7,48(%rsp) + movdqa (%r10),%xmm7 + movl $10,%eax + movdqa %xmm0,256-256(%rcx) + jmp L$oop4x + +.p2align 5 +L$oop4x: + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,199 +.byte 102,15,56,0,207 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm6 + pslld $12,%xmm12 + psrld $20,%xmm6 + movdqa %xmm13,%xmm7 + pslld $12,%xmm13 + por %xmm6,%xmm12 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm13 + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm7 + pslld $7,%xmm12 + psrld $25,%xmm7 + movdqa %xmm13,%xmm6 + pslld $7,%xmm13 + por %xmm7,%xmm12 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm13 + movdqa %xmm4,0(%rsp) + movdqa %xmm5,16(%rsp) + movdqa 32(%rsp),%xmm4 + movdqa 48(%rsp),%xmm5 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,215 +.byte 102,15,56,0,223 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm6 + pslld $12,%xmm14 + psrld $20,%xmm6 + movdqa %xmm15,%xmm7 + pslld $12,%xmm15 + por %xmm6,%xmm14 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm15 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,214 +.byte 102,15,56,0,222 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm7 + pslld $7,%xmm14 + psrld $25,%xmm7 + movdqa %xmm15,%xmm6 + pslld $7,%xmm15 + por %xmm7,%xmm14 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm15 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,223 +.byte 102,15,56,0,199 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm6 + pslld $12,%xmm13 + psrld $20,%xmm6 + movdqa %xmm14,%xmm7 + pslld $12,%xmm14 + por %xmm6,%xmm13 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm14 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,222 +.byte 102,15,56,0,198 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa 
%xmm13,%xmm7 + pslld $7,%xmm13 + psrld $25,%xmm7 + movdqa %xmm14,%xmm6 + pslld $7,%xmm14 + por %xmm7,%xmm13 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm14 + movdqa %xmm4,32(%rsp) + movdqa %xmm5,48(%rsp) + movdqa 0(%rsp),%xmm4 + movdqa 16(%rsp),%xmm5 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm6 + pslld $12,%xmm15 + psrld $20,%xmm6 + movdqa %xmm12,%xmm7 + pslld $12,%xmm12 + por %xmm6,%xmm15 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm12 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm7 + pslld $7,%xmm15 + psrld $25,%xmm7 + movdqa %xmm12,%xmm6 + pslld $7,%xmm12 + por %xmm7,%xmm15 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm12 + decl %eax + jnz L$oop4x + + paddd 64(%rsp),%xmm8 + paddd 80(%rsp),%xmm9 + paddd 96(%rsp),%xmm10 + paddd 112(%rsp),%xmm11 + + movdqa %xmm8,%xmm6 + punpckldq %xmm9,%xmm8 + movdqa %xmm10,%xmm7 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm9,%xmm6 + punpckhdq %xmm11,%xmm7 + movdqa %xmm8,%xmm9 + punpcklqdq %xmm10,%xmm8 + movdqa %xmm6,%xmm11 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm10,%xmm9 + punpckhqdq %xmm7,%xmm11 + paddd 128-256(%rcx),%xmm12 + paddd 144-256(%rcx),%xmm13 + paddd 160-256(%rcx),%xmm14 + paddd 176-256(%rcx),%xmm15 + + movdqa %xmm8,0(%rsp) + movdqa %xmm9,16(%rsp) + movdqa 32(%rsp),%xmm8 + movdqa 48(%rsp),%xmm9 + + movdqa %xmm12,%xmm10 + punpckldq %xmm13,%xmm12 + movdqa %xmm14,%xmm7 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm13,%xmm10 + punpckhdq %xmm15,%xmm7 + movdqa %xmm12,%xmm13 + punpcklqdq %xmm14,%xmm12 + movdqa %xmm10,%xmm15 + punpcklqdq %xmm7,%xmm10 + punpckhqdq %xmm14,%xmm13 + punpckhqdq %xmm7,%xmm15 + paddd 192-256(%rcx),%xmm4 + paddd 208-256(%rcx),%xmm5 + paddd 224-256(%rcx),%xmm8 + paddd 240-256(%rcx),%xmm9 + + movdqa %xmm6,32(%rsp) + movdqa %xmm11,48(%rsp) + + movdqa %xmm4,%xmm14 + punpckldq %xmm5,%xmm4 + movdqa %xmm8,%xmm7 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm5,%xmm14 + punpckhdq %xmm9,%xmm7 + movdqa %xmm4,%xmm5 + punpcklqdq %xmm8,%xmm4 + movdqa %xmm14,%xmm9 + punpcklqdq %xmm7,%xmm14 + punpckhqdq %xmm8,%xmm5 + punpckhqdq %xmm7,%xmm9 + paddd 256-256(%rcx),%xmm0 + paddd 272-256(%rcx),%xmm1 + paddd 288-256(%rcx),%xmm2 + paddd 304-256(%rcx),%xmm3 + + movdqa %xmm0,%xmm8 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm8 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm8,%xmm3 + punpcklqdq %xmm7,%xmm8 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + cmpq $256,%rdx + jb L$tail4x + + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 
+ pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 48(%rsp),%xmm6 + pxor %xmm15,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm3,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz L$oop_outer4x + + jmp L$done4x + +L$tail4x: + cmpq $192,%rdx + jae L$192_or_more4x + cmpq $128,%rdx + jae L$128_or_more4x + cmpq $64,%rdx + jae L$64_or_more4x + + + xorq %r10,%r10 + + movdqa %xmm12,16(%rsp) + movdqa %xmm4,32(%rsp) + movdqa %xmm0,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$64_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je L$done4x + + movdqa 16(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm13,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm5,32(%rsp) + subq $64,%rdx + movdqa %xmm1,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$128_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + je L$done4x + + movdqa 32(%rsp),%xmm6 + leaq 128(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm10,16(%rsp) + leaq 128(%rdi),%rdi + movdqa %xmm14,32(%rsp) + subq $128,%rdx + movdqa %xmm8,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$192_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je L$done4x + + movdqa 48(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm15,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm9,32(%rsp) + subq $192,%rdx + movdqa %xmm3,48(%rsp) + +L$oop_tail4x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail4x + +L$done4x: + leaq (%r9),%rsp + +L$4x_epilogue: + ret + + +.globl _ChaCha20_ctr32_avx2 +.private_extern _ChaCha20_ctr32_avx2 + +.p2align 5 
+_ChaCha20_ctr32_avx2: + +_CET_ENDBR + movq %rsp,%r9 + + subq $0x280+8,%rsp + andq $-32,%rsp + vzeroupper + + + + + + + + + + + vbroadcasti128 L$sigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq L$rot16(%rip),%r10 + leaq L$rot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd L$incy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp L$oop_enter8x + +.p2align 5 +L$oop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd L$eight(%rip),%ymm4,%ymm4 + +L$oop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp L$oop8x + +.p2align 5 +L$oop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor 
%ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz L$oop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 
$0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb L$tail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz L$oop_outer8x + + jmp L$done8x + +L$tail8x: + cmpq $448,%rdx + jae L$448_or_more8x + cmpq $384,%rdx + jae L$384_or_more8x + cmpq $320,%rdx + jae L$320_or_more8x + cmpq $256,%rdx + jae L$256_or_more8x + cmpq $192,%rdx + jae L$192_or_more8x + cmpq $128,%rdx + jae L$128_or_more8x + cmpq $64,%rdx + jae L$64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je L$done8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je L$done8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp L$oop_tail8x 
+ +.p2align 5 +L$192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je L$done8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je L$done8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je L$done8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je L$done8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + 
vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je L$done8x + + leaq 448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +L$oop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail8x + +L$done8x: + vzeroall + leaq (%r9),%rsp + +L$8x_epilogue: + ret + + +#endif diff --git a/ring-0.17.14/pregenerated/chacha-x86_64-nasm.asm b/ring-0.17.14/pregenerated/chacha-x86_64-nasm.asm new file mode 100644 index 0000000000..3a08c9500b --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha-x86_64-nasm.asm @@ -0,0 +1,1752 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +section .rdata rdata align=8 +ALIGN 64 +$L$zero: + DD 0,0,0,0 +$L$one: + DD 1,0,0,0 +$L$inc: + DD 0,1,2,3 +$L$four: + DD 4,4,4,4 +$L$incy: + DD 0,2,4,6,1,3,5,7 +$L$eight: + DD 8,8,8,8,8,8,8,8 +$L$rot16: + DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd +$L$rot24: + DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe +$L$sigma: + DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 + DB 0 +ALIGN 64 +$L$zeroz: + DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 +$L$fourz: + DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 +$L$incz: + DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +$L$sixteen: + DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 + DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 + DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 + DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 + DB 108,46,111,114,103,62,0 +section .text + +global ChaCha20_ctr32_nohw + +ALIGN 64 +ChaCha20_ctr32_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,64+24 + +$L$ctr32_body: + + + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm4,XMMWORD[$L$one] + + + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov rbp,rdx + jmp NEAR $L$oop_outer + +ALIGN 32 +$L$oop_outer: + mov eax,0x61707865 + mov ebx,0x3320646e + mov ecx,0x79622d32 + mov edx,0x6b206574 + mov r8d,DWORD[16+rsp] + mov r9d,DWORD[20+rsp] + mov r10d,DWORD[24+rsp] + mov r11d,DWORD[28+rsp] + movd r12d,xmm3 + mov r13d,DWORD[52+rsp] + mov r14d,DWORD[56+rsp] + mov r15d,DWORD[60+rsp] + + mov QWORD[((64+0))+rsp],rbp + mov ebp,10 + mov QWORD[((64+8))+rsp],rsi +DB 102,72,15,126,214 + mov QWORD[((64+16))+rsp],rdi + mov rdi,rsi + shr rdi,32 + jmp NEAR $L$oop + +ALIGN 32 +$L$oop: + add eax,r8d + xor r12d,eax + rol r12d,16 + add ebx,r9d + xor r13d,ebx + rol r13d,16 + add esi,r12d + xor r8d,esi + rol r8d,12 + add edi,r13d + xor r9d,edi + rol r9d,12 + add eax,r8d + xor r12d,eax + rol r12d,8 + add ebx,r9d + xor r13d,ebx + rol r13d,8 + add esi,r12d + xor r8d,esi + rol r8d,7 + add edi,r13d + xor r9d,edi + rol r9d,7 + mov DWORD[32+rsp],esi + mov DWORD[36+rsp],edi + mov esi,DWORD[40+rsp] + mov edi,DWORD[44+rsp] + add ecx,r10d + xor r14d,ecx + rol r14d,16 + add edx,r11d + xor r15d,edx + 
rol r15d,16 + add esi,r14d + xor r10d,esi + rol r10d,12 + add edi,r15d + xor r11d,edi + rol r11d,12 + add ecx,r10d + xor r14d,ecx + rol r14d,8 + add edx,r11d + xor r15d,edx + rol r15d,8 + add esi,r14d + xor r10d,esi + rol r10d,7 + add edi,r15d + xor r11d,edi + rol r11d,7 + add eax,r9d + xor r15d,eax + rol r15d,16 + add ebx,r10d + xor r12d,ebx + rol r12d,16 + add esi,r15d + xor r9d,esi + rol r9d,12 + add edi,r12d + xor r10d,edi + rol r10d,12 + add eax,r9d + xor r15d,eax + rol r15d,8 + add ebx,r10d + xor r12d,ebx + rol r12d,8 + add esi,r15d + xor r9d,esi + rol r9d,7 + add edi,r12d + xor r10d,edi + rol r10d,7 + mov DWORD[40+rsp],esi + mov DWORD[44+rsp],edi + mov esi,DWORD[32+rsp] + mov edi,DWORD[36+rsp] + add ecx,r11d + xor r13d,ecx + rol r13d,16 + add edx,r8d + xor r14d,edx + rol r14d,16 + add esi,r13d + xor r11d,esi + rol r11d,12 + add edi,r14d + xor r8d,edi + rol r8d,12 + add ecx,r11d + xor r13d,ecx + rol r13d,8 + add edx,r8d + xor r14d,edx + rol r14d,8 + add esi,r13d + xor r11d,esi + rol r11d,7 + add edi,r14d + xor r8d,edi + rol r8d,7 + dec ebp + jnz NEAR $L$oop + mov DWORD[36+rsp],edi + mov DWORD[32+rsp],esi + mov rbp,QWORD[64+rsp] + movdqa xmm1,xmm2 + mov rsi,QWORD[((64+8))+rsp] + paddd xmm3,xmm4 + mov rdi,QWORD[((64+16))+rsp] + + add eax,0x61707865 + add ebx,0x3320646e + add ecx,0x79622d32 + add edx,0x6b206574 + add r8d,DWORD[16+rsp] + add r9d,DWORD[20+rsp] + add r10d,DWORD[24+rsp] + add r11d,DWORD[28+rsp] + add r12d,DWORD[48+rsp] + add r13d,DWORD[52+rsp] + add r14d,DWORD[56+rsp] + add r15d,DWORD[60+rsp] + paddd xmm1,XMMWORD[32+rsp] + + cmp rbp,64 + jb NEAR $L$tail + + xor eax,DWORD[rsi] + xor ebx,DWORD[4+rsi] + xor ecx,DWORD[8+rsi] + xor edx,DWORD[12+rsi] + xor r8d,DWORD[16+rsi] + xor r9d,DWORD[20+rsi] + xor r10d,DWORD[24+rsi] + xor r11d,DWORD[28+rsi] + movdqu xmm0,XMMWORD[32+rsi] + xor r12d,DWORD[48+rsi] + xor r13d,DWORD[52+rsi] + xor r14d,DWORD[56+rsi] + xor r15d,DWORD[60+rsi] + lea rsi,[64+rsi] + pxor xmm0,xmm1 + + movdqa XMMWORD[32+rsp],xmm2 + movd DWORD[48+rsp],xmm3 + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + movdqu XMMWORD[32+rdi],xmm0 + mov DWORD[48+rdi],r12d + mov DWORD[52+rdi],r13d + mov DWORD[56+rdi],r14d + mov DWORD[60+rdi],r15d + lea rdi,[64+rdi] + + sub rbp,64 + jnz NEAR $L$oop_outer + + jmp NEAR $L$done + +ALIGN 16 +$L$tail: + mov DWORD[rsp],eax + mov DWORD[4+rsp],ebx + xor rbx,rbx + mov DWORD[8+rsp],ecx + mov DWORD[12+rsp],edx + mov DWORD[16+rsp],r8d + mov DWORD[20+rsp],r9d + mov DWORD[24+rsp],r10d + mov DWORD[28+rsp],r11d + movdqa XMMWORD[32+rsp],xmm1 + mov DWORD[48+rsp],r12d + mov DWORD[52+rsp],r13d + mov DWORD[56+rsp],r14d + mov DWORD[60+rsp],r15d + +$L$oop_tail: + movzx eax,BYTE[rbx*1+rsi] + movzx edx,BYTE[rbx*1+rsp] + lea rbx,[1+rbx] + xor eax,edx + mov BYTE[((-1))+rbx*1+rdi],al + dec rbp + jnz NEAR $L$oop_tail + +$L$done: + lea rsi,[((64+24+48))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$no_data: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_nohw: +global ChaCha20_ctr32_ssse3_4x + +ALIGN 32 +ChaCha20_ctr32_ssse3_4x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_ssse3_4x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov 
r8,QWORD[40+rsp] + + + +_CET_ENDBR + mov r9,rsp + + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4x_body: + movdqa xmm11,XMMWORD[$L$sigma] + movdqu xmm15,XMMWORD[rcx] + movdqu xmm7,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + lea rcx,[256+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + pshufd xmm8,xmm11,0x00 + pshufd xmm9,xmm11,0x55 + movdqa XMMWORD[64+rsp],xmm8 + pshufd xmm10,xmm11,0xaa + movdqa XMMWORD[80+rsp],xmm9 + pshufd xmm11,xmm11,0xff + movdqa XMMWORD[96+rsp],xmm10 + movdqa XMMWORD[112+rsp],xmm11 + + pshufd xmm12,xmm15,0x00 + pshufd xmm13,xmm15,0x55 + movdqa XMMWORD[(128-256)+rcx],xmm12 + pshufd xmm14,xmm15,0xaa + movdqa XMMWORD[(144-256)+rcx],xmm13 + pshufd xmm15,xmm15,0xff + movdqa XMMWORD[(160-256)+rcx],xmm14 + movdqa XMMWORD[(176-256)+rcx],xmm15 + + pshufd xmm4,xmm7,0x00 + pshufd xmm5,xmm7,0x55 + movdqa XMMWORD[(192-256)+rcx],xmm4 + pshufd xmm6,xmm7,0xaa + movdqa XMMWORD[(208-256)+rcx],xmm5 + pshufd xmm7,xmm7,0xff + movdqa XMMWORD[(224-256)+rcx],xmm6 + movdqa XMMWORD[(240-256)+rcx],xmm7 + + pshufd xmm0,xmm3,0x00 + pshufd xmm1,xmm3,0x55 + paddd xmm0,XMMWORD[$L$inc] + pshufd xmm2,xmm3,0xaa + movdqa XMMWORD[(272-256)+rcx],xmm1 + pshufd xmm3,xmm3,0xff + movdqa XMMWORD[(288-256)+rcx],xmm2 + movdqa XMMWORD[(304-256)+rcx],xmm3 + + jmp NEAR $L$oop_enter4x + +ALIGN 32 +$L$oop_outer4x: + movdqa xmm8,XMMWORD[64+rsp] + movdqa xmm9,XMMWORD[80+rsp] + movdqa xmm10,XMMWORD[96+rsp] + movdqa xmm11,XMMWORD[112+rsp] + movdqa xmm12,XMMWORD[((128-256))+rcx] + movdqa xmm13,XMMWORD[((144-256))+rcx] + movdqa xmm14,XMMWORD[((160-256))+rcx] + movdqa xmm15,XMMWORD[((176-256))+rcx] + movdqa xmm4,XMMWORD[((192-256))+rcx] + movdqa xmm5,XMMWORD[((208-256))+rcx] + movdqa xmm6,XMMWORD[((224-256))+rcx] + movdqa xmm7,XMMWORD[((240-256))+rcx] + movdqa xmm0,XMMWORD[((256-256))+rcx] + movdqa xmm1,XMMWORD[((272-256))+rcx] + movdqa xmm2,XMMWORD[((288-256))+rcx] + movdqa xmm3,XMMWORD[((304-256))+rcx] + paddd xmm0,XMMWORD[$L$four] + +$L$oop_enter4x: + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm7 + movdqa xmm7,XMMWORD[r10] + mov eax,10 + movdqa XMMWORD[(256-256)+rcx],xmm0 + jmp NEAR $L$oop4x + +ALIGN 32 +$L$oop4x: + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,199 +DB 102,15,56,0,207 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm6,xmm12 + pslld xmm12,12 + psrld xmm6,20 + movdqa xmm7,xmm13 + pslld xmm13,12 + por xmm12,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm13,xmm7 + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,198 +DB 102,15,56,0,206 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm7,xmm12 + pslld xmm12,7 + psrld xmm7,25 + movdqa xmm6,xmm13 + pslld xmm13,7 + por xmm12,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm13,xmm6 + movdqa XMMWORD[rsp],xmm4 + movdqa XMMWORD[16+rsp],xmm5 + movdqa xmm4,XMMWORD[32+rsp] + movdqa xmm5,XMMWORD[48+rsp] + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,215 +DB 102,15,56,0,223 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm6,xmm14 + pslld xmm14,12 + psrld xmm6,20 + movdqa xmm7,xmm15 + pslld xmm15,12 + por xmm14,xmm6 
+ psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm15,xmm7 + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,214 +DB 102,15,56,0,222 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm7,xmm14 + pslld xmm14,7 + psrld xmm7,25 + movdqa xmm6,xmm15 + pslld xmm15,7 + por xmm14,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm15,xmm6 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,223 +DB 102,15,56,0,199 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm6,xmm13 + pslld xmm13,12 + psrld xmm6,20 + movdqa xmm7,xmm14 + pslld xmm14,12 + por xmm13,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm14,xmm7 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,222 +DB 102,15,56,0,198 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm7,xmm13 + pslld xmm13,7 + psrld xmm7,25 + movdqa xmm6,xmm14 + pslld xmm14,7 + por xmm13,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm14,xmm6 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm5 + movdqa xmm4,XMMWORD[rsp] + movdqa xmm5,XMMWORD[16+rsp] + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,207 +DB 102,15,56,0,215 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm6,xmm15 + pslld xmm15,12 + psrld xmm6,20 + movdqa xmm7,xmm12 + pslld xmm12,12 + por xmm15,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm12,xmm7 + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,206 +DB 102,15,56,0,214 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm7,xmm15 + pslld xmm15,7 + psrld xmm7,25 + movdqa xmm6,xmm12 + pslld xmm12,7 + por xmm15,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm12,xmm6 + dec eax + jnz NEAR $L$oop4x + + paddd xmm8,XMMWORD[64+rsp] + paddd xmm9,XMMWORD[80+rsp] + paddd xmm10,XMMWORD[96+rsp] + paddd xmm11,XMMWORD[112+rsp] + + movdqa xmm6,xmm8 + punpckldq xmm8,xmm9 + movdqa xmm7,xmm10 + punpckldq xmm10,xmm11 + punpckhdq xmm6,xmm9 + punpckhdq xmm7,xmm11 + movdqa xmm9,xmm8 + punpcklqdq xmm8,xmm10 + movdqa xmm11,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm9,xmm10 + punpckhqdq xmm11,xmm7 + paddd xmm12,XMMWORD[((128-256))+rcx] + paddd xmm13,XMMWORD[((144-256))+rcx] + paddd xmm14,XMMWORD[((160-256))+rcx] + paddd xmm15,XMMWORD[((176-256))+rcx] + + movdqa XMMWORD[rsp],xmm8 + movdqa XMMWORD[16+rsp],xmm9 + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + + movdqa xmm10,xmm12 + punpckldq xmm12,xmm13 + movdqa xmm7,xmm14 + punpckldq xmm14,xmm15 + punpckhdq xmm10,xmm13 + punpckhdq xmm7,xmm15 + movdqa xmm13,xmm12 + punpcklqdq xmm12,xmm14 + movdqa xmm15,xmm10 + punpcklqdq xmm10,xmm7 + punpckhqdq xmm13,xmm14 + punpckhqdq xmm15,xmm7 + paddd xmm4,XMMWORD[((192-256))+rcx] + paddd xmm5,XMMWORD[((208-256))+rcx] + paddd xmm8,XMMWORD[((224-256))+rcx] + paddd xmm9,XMMWORD[((240-256))+rcx] + + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm11 + + movdqa xmm14,xmm4 + punpckldq xmm4,xmm5 + movdqa xmm7,xmm8 + punpckldq xmm8,xmm9 + punpckhdq xmm14,xmm5 + punpckhdq xmm7,xmm9 + movdqa xmm5,xmm4 + punpcklqdq xmm4,xmm8 + movdqa xmm9,xmm14 + punpcklqdq xmm14,xmm7 + punpckhqdq xmm5,xmm8 + punpckhqdq xmm9,xmm7 + paddd xmm0,XMMWORD[((256-256))+rcx] + paddd xmm1,XMMWORD[((272-256))+rcx] + paddd xmm2,XMMWORD[((288-256))+rcx] + paddd xmm3,XMMWORD[((304-256))+rcx] + + movdqa 
xmm8,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm8,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm8 + punpcklqdq xmm8,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + cmp rdx,64*4 + jb NEAR $L$tail4x + + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[48+rsp] + pxor xmm11,xmm15 + pxor xmm2,xmm9 + pxor xmm7,xmm3 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + + sub rdx,64*4 + jnz NEAR $L$oop_outer4x + + jmp NEAR $L$done4x + +$L$tail4x: + cmp rdx,192 + jae NEAR $L$192_or_more4x + cmp rdx,128 + jae NEAR $L$128_or_more4x + cmp rdx,64 + jae NEAR $L$64_or_more4x + + + xor r10,r10 + + movdqa XMMWORD[16+rsp],xmm12 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm0 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$64_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[16+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm13 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm5 + sub rdx,64 + movdqa XMMWORD[48+rsp],xmm1 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$128_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[32+rsp] + lea rsi,[128+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm10 + lea rdi,[128+rdi] + movdqa XMMWORD[32+rsp],xmm14 + sub rdx,128 + movdqa XMMWORD[48+rsp],xmm8 + jmp NEAR $L$oop_tail4x + +ALIGN 32 
+$L$192_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[48+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm15 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm9 + sub rdx,192 + movdqa XMMWORD[48+rsp],xmm3 + +$L$oop_tail4x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail4x + +$L$done4x: + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_ssse3_4x: +global ChaCha20_ctr32_avx2 + +ALIGN 32 +ChaCha20_ctr32_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + mov r9,rsp + + sub rsp,0x280+168 + and rsp,-32 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8x_body: + vzeroupper + + + + + + + + + + + vbroadcasti128 ymm11,XMMWORD[$L$sigma] + vbroadcasti128 ymm3,XMMWORD[rcx] + vbroadcasti128 ymm15,XMMWORD[16+rcx] + vbroadcasti128 ymm7,XMMWORD[r8] + lea rcx,[256+rsp] + lea rax,[512+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + vpshufd ymm8,ymm11,0x00 + vpshufd ymm9,ymm11,0x55 + vmovdqa YMMWORD[(128-256)+rcx],ymm8 + vpshufd ymm10,ymm11,0xaa + vmovdqa YMMWORD[(160-256)+rcx],ymm9 + vpshufd ymm11,ymm11,0xff + vmovdqa YMMWORD[(192-256)+rcx],ymm10 + vmovdqa YMMWORD[(224-256)+rcx],ymm11 + + vpshufd ymm0,ymm3,0x00 + vpshufd ymm1,ymm3,0x55 + vmovdqa YMMWORD[(256-256)+rcx],ymm0 + vpshufd ymm2,ymm3,0xaa + vmovdqa YMMWORD[(288-256)+rcx],ymm1 + vpshufd ymm3,ymm3,0xff + vmovdqa YMMWORD[(320-256)+rcx],ymm2 + vmovdqa YMMWORD[(352-256)+rcx],ymm3 + + vpshufd ymm12,ymm15,0x00 + vpshufd ymm13,ymm15,0x55 + vmovdqa YMMWORD[(384-512)+rax],ymm12 + vpshufd ymm14,ymm15,0xaa + vmovdqa YMMWORD[(416-512)+rax],ymm13 + vpshufd 
ymm15,ymm15,0xff + vmovdqa YMMWORD[(448-512)+rax],ymm14 + vmovdqa YMMWORD[(480-512)+rax],ymm15 + + vpshufd ymm4,ymm7,0x00 + vpshufd ymm5,ymm7,0x55 + vpaddd ymm4,ymm4,YMMWORD[$L$incy] + vpshufd ymm6,ymm7,0xaa + vmovdqa YMMWORD[(544-512)+rax],ymm5 + vpshufd ymm7,ymm7,0xff + vmovdqa YMMWORD[(576-512)+rax],ymm6 + vmovdqa YMMWORD[(608-512)+rax],ymm7 + + jmp NEAR $L$oop_enter8x + +ALIGN 32 +$L$oop_outer8x: + vmovdqa ymm8,YMMWORD[((128-256))+rcx] + vmovdqa ymm9,YMMWORD[((160-256))+rcx] + vmovdqa ymm10,YMMWORD[((192-256))+rcx] + vmovdqa ymm11,YMMWORD[((224-256))+rcx] + vmovdqa ymm0,YMMWORD[((256-256))+rcx] + vmovdqa ymm1,YMMWORD[((288-256))+rcx] + vmovdqa ymm2,YMMWORD[((320-256))+rcx] + vmovdqa ymm3,YMMWORD[((352-256))+rcx] + vmovdqa ymm12,YMMWORD[((384-512))+rax] + vmovdqa ymm13,YMMWORD[((416-512))+rax] + vmovdqa ymm14,YMMWORD[((448-512))+rax] + vmovdqa ymm15,YMMWORD[((480-512))+rax] + vmovdqa ymm4,YMMWORD[((512-512))+rax] + vmovdqa ymm5,YMMWORD[((544-512))+rax] + vmovdqa ymm6,YMMWORD[((576-512))+rax] + vmovdqa ymm7,YMMWORD[((608-512))+rax] + vpaddd ymm4,ymm4,YMMWORD[$L$eight] + +$L$oop_enter8x: + vmovdqa YMMWORD[64+rsp],ymm14 + vmovdqa YMMWORD[96+rsp],ymm15 + vbroadcasti128 ymm15,XMMWORD[r10] + vmovdqa YMMWORD[(512-512)+rax],ymm4 + mov eax,10 + jmp NEAR $L$oop8x + +ALIGN 32 +$L$oop8x: + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm14,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm14,ymm0 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm15,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm15,ymm1 + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm15,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm15,ymm0 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm14,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm14,ymm1 + vmovdqa YMMWORD[rsp],ymm12 + vmovdqa YMMWORD[32+rsp],ymm13 + vmovdqa ymm12,YMMWORD[64+rsp] + vmovdqa ymm13,YMMWORD[96+rsp] + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb ymm6,ymm6,ymm15 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm14,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm14,ymm2 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm15,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm15,ymm3 + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm15,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm15,ymm2 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm14,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm14,ymm3 + vpaddd ymm8,ymm8,ymm1 + vpxor ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm14,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm14,ymm1 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm15,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm15,ymm2 + vpaddd ymm8,ymm8,ymm1 + vpxor 
ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm15,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm15,ymm1 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm14,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm14,ymm2 + vmovdqa YMMWORD[64+rsp],ymm12 + vmovdqa YMMWORD[96+rsp],ymm13 + vmovdqa ymm12,YMMWORD[rsp] + vmovdqa ymm13,YMMWORD[32+rsp] + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm15 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm14,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm14,ymm3 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm15,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm15,ymm0 + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm15,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm15,ymm3 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm14,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm14,ymm0 + dec eax + jnz NEAR $L$oop8x + + lea rax,[512+rsp] + vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] + vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] + vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] + vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] + + vpunpckldq ymm14,ymm8,ymm9 + vpunpckldq ymm15,ymm10,ymm11 + vpunpckhdq ymm8,ymm8,ymm9 + vpunpckhdq ymm10,ymm10,ymm11 + vpunpcklqdq ymm9,ymm14,ymm15 + vpunpckhqdq ymm14,ymm14,ymm15 + vpunpcklqdq ymm11,ymm8,ymm10 + vpunpckhqdq ymm8,ymm8,ymm10 + vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] + vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] + vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] + vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] + + vpunpckldq ymm10,ymm0,ymm1 + vpunpckldq ymm15,ymm2,ymm3 + vpunpckhdq ymm0,ymm0,ymm1 + vpunpckhdq ymm2,ymm2,ymm3 + vpunpcklqdq ymm1,ymm10,ymm15 + vpunpckhqdq ymm10,ymm10,ymm15 + vpunpcklqdq ymm3,ymm0,ymm2 + vpunpckhqdq ymm0,ymm0,ymm2 + vperm2i128 ymm15,ymm9,ymm1,0x20 + vperm2i128 ymm1,ymm9,ymm1,0x31 + vperm2i128 ymm9,ymm14,ymm10,0x20 + vperm2i128 ymm10,ymm14,ymm10,0x31 + vperm2i128 ymm14,ymm11,ymm3,0x20 + vperm2i128 ymm3,ymm11,ymm3,0x31 + vperm2i128 ymm11,ymm8,ymm0,0x20 + vperm2i128 ymm0,ymm8,ymm0,0x31 + vmovdqa YMMWORD[rsp],ymm15 + vmovdqa YMMWORD[32+rsp],ymm9 + vmovdqa ymm15,YMMWORD[64+rsp] + vmovdqa ymm9,YMMWORD[96+rsp] + + vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] + vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] + vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] + vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] + + vpunpckldq ymm2,ymm12,ymm13 + vpunpckldq ymm8,ymm15,ymm9 + vpunpckhdq ymm12,ymm12,ymm13 + vpunpckhdq ymm15,ymm15,ymm9 + vpunpcklqdq ymm13,ymm2,ymm8 + vpunpckhqdq ymm2,ymm2,ymm8 + vpunpcklqdq ymm9,ymm12,ymm15 + vpunpckhqdq ymm12,ymm12,ymm15 + vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] + vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] + vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] + vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] + + vpunpckldq ymm15,ymm4,ymm5 + vpunpckldq ymm8,ymm6,ymm7 + vpunpckhdq ymm4,ymm4,ymm5 + vpunpckhdq ymm6,ymm6,ymm7 + vpunpcklqdq ymm5,ymm15,ymm8 + vpunpckhqdq ymm15,ymm15,ymm8 + vpunpcklqdq ymm7,ymm4,ymm6 + vpunpckhqdq ymm4,ymm4,ymm6 + vperm2i128 ymm8,ymm13,ymm5,0x20 + vperm2i128 ymm5,ymm13,ymm5,0x31 + 
vperm2i128 ymm13,ymm2,ymm15,0x20 + vperm2i128 ymm15,ymm2,ymm15,0x31 + vperm2i128 ymm2,ymm9,ymm7,0x20 + vperm2i128 ymm7,ymm9,ymm7,0x31 + vperm2i128 ymm9,ymm12,ymm4,0x20 + vperm2i128 ymm4,ymm12,ymm4,0x31 + vmovdqa ymm6,YMMWORD[rsp] + vmovdqa ymm12,YMMWORD[32+rsp] + + cmp rdx,64*8 + jb NEAR $L$tail8x + + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + lea rdi,[128+rdi] + + vpxor ymm12,ymm12,YMMWORD[rsi] + vpxor ymm13,ymm13,YMMWORD[32+rsi] + vpxor ymm10,ymm10,YMMWORD[64+rsi] + vpxor ymm15,ymm15,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm12 + vmovdqu YMMWORD[32+rdi],ymm13 + vmovdqu YMMWORD[64+rdi],ymm10 + vmovdqu YMMWORD[96+rdi],ymm15 + lea rdi,[128+rdi] + + vpxor ymm14,ymm14,YMMWORD[rsi] + vpxor ymm2,ymm2,YMMWORD[32+rsi] + vpxor ymm3,ymm3,YMMWORD[64+rsi] + vpxor ymm7,ymm7,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm14 + vmovdqu YMMWORD[32+rdi],ymm2 + vmovdqu YMMWORD[64+rdi],ymm3 + vmovdqu YMMWORD[96+rdi],ymm7 + lea rdi,[128+rdi] + + vpxor ymm11,ymm11,YMMWORD[rsi] + vpxor ymm9,ymm9,YMMWORD[32+rsi] + vpxor ymm0,ymm0,YMMWORD[64+rsi] + vpxor ymm4,ymm4,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm11 + vmovdqu YMMWORD[32+rdi],ymm9 + vmovdqu YMMWORD[64+rdi],ymm0 + vmovdqu YMMWORD[96+rdi],ymm4 + lea rdi,[128+rdi] + + sub rdx,64*8 + jnz NEAR $L$oop_outer8x + + jmp NEAR $L$done8x + +$L$tail8x: + cmp rdx,448 + jae NEAR $L$448_or_more8x + cmp rdx,384 + jae NEAR $L$384_or_more8x + cmp rdx,320 + jae NEAR $L$320_or_more8x + cmp rdx,256 + jae NEAR $L$256_or_more8x + cmp rdx,192 + jae NEAR $L$192_or_more8x + cmp rdx,128 + jae NEAR $L$128_or_more8x + cmp rdx,64 + jae NEAR $L$64_or_more8x + + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm6 + vmovdqa YMMWORD[32+rsp],ymm8 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$64_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + je NEAR $L$done8x + + lea rsi,[64+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm1 + lea rdi,[64+rdi] + sub rdx,64 + vmovdqa YMMWORD[32+rsp],ymm5 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$128_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + je NEAR $L$done8x + + lea rsi,[128+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm12 + lea rdi,[128+rdi] + sub rdx,128 + vmovdqa YMMWORD[32+rsp],ymm13 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$192_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + je NEAR $L$done8x + + lea rsi,[192+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm10 + lea rdi,[192+rdi] + sub rdx,192 + vmovdqa YMMWORD[32+rsp],ymm15 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$256_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor 
ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + je NEAR $L$done8x + + lea rsi,[256+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm14 + lea rdi,[256+rdi] + sub rdx,256 + vmovdqa YMMWORD[32+rsp],ymm2 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$320_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + je NEAR $L$done8x + + lea rsi,[320+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm3 + lea rdi,[320+rdi] + sub rdx,320 + vmovdqa YMMWORD[32+rsp],ymm7 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$384_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + je NEAR $L$done8x + + lea rsi,[384+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm11 + lea rdi,[384+rdi] + sub rdx,384 + vmovdqa YMMWORD[32+rsp],ymm9 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$448_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vpxor ymm11,ymm11,YMMWORD[384+rsi] + vpxor ymm9,ymm9,YMMWORD[416+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + vmovdqu YMMWORD[384+rdi],ymm11 + vmovdqu YMMWORD[416+rdi],ymm9 + je NEAR $L$done8x + + lea rsi,[448+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm0 + lea rdi,[448+rdi] + sub rdx,448 + 
vmovdqa YMMWORD[32+rsp],ymm4 + +$L$oop_tail8x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail8x + +$L$done8x: + vzeroall + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$8x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_avx2: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + lea r10,[$L$ctr32_body] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$no_data] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[((64+24+48))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + + +ALIGN 16 +ssse3_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-40))+rax] + lea rdi,[512+r8] + mov ecx,4 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD 
$L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ChaCha20_ctr32_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + +$L$SEH_info_ChaCha20_ctr32_ssse3_4x: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase +$L$SEH_info_ChaCha20_ctr32_avx2: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/chacha-x86_64-nasm.o b/ring-0.17.14/pregenerated/chacha-x86_64-nasm.o new file mode 100644 index 0000000000..f606b4015f Binary files /dev/null and b/ring-0.17.14/pregenerated/chacha-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-ios64.S b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-ios64.S new file mode 100644 index 0000000000..36e007434a --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-ios64.S @@ -0,0 +1,3008 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + +.align 7 +Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +Linc: +.long 1,2,3,4 +Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + + +.align 6 +Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, Lpoly_hash_intro + ret + +Lpoly_hash_intro: + cmp x4, #16 + b.lt Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lpoly_hash_ad_internal + +Lpoly_hash_ad_tail: + cbz x4, Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr 
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lpoly_hash_ad_ret: + ret +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl _chacha20_poly1305_seal +.private_extern _chacha20_poly1305_seal + +.align 6 +_chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + 
eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - 
v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +Lseal_main_loop: + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 +Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most 
(value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + 
add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b Lseal_main_loop + +Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
+ cmp x2, #64 + b.lt Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits 
and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b Lseal_tail + +Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b Lseal_tail_64 + +Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_hash_extra: + cbz x4, Lseal_finalize + +Lseal_hash_extra_loop: + cmp x4, #16 + b.lt Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lseal_hash_extra_loop + +Lseal_hash_extra_tail: + cbz x4, Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli 
v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + b Lseal_tail +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl _chacha20_poly1305_open +.private_extern _chacha20_poly1305_open + +.align 6 +_chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! 
+.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_ad_done: + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +Lopen_main_loop: + + cmp x2, #192 + b.lt Lopen_tail + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, 
Lopen_main_loop_rounds_short + +.align 5 +Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 
is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, 
v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b Lopen_main_loop + +Lopen_tail: + + cbz x2, Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le Lopen_tail_64 + cmp x2, #128 + b.le Lopen_tail_128 + +Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, Lopen_tail_192_rounds_no_hash + +Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul 
x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt Lopen_tail_192_rounds + subs x6, x6, #1 
+ b.ge Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +Lopen_tail_192_hash: + cbz x4, Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_tail_192_hash + +Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b Lopen_tail_64_store + +Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, 
v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_128_rounds + cbz x4, Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_128_rounds + +Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b Lopen_tail_64_store + +Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, 
v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_64_rounds + cbz x4, Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_64_rounds + +Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +Lopen_tail_64_store: + cmp x2, #16 + b.lt Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b Lopen_tail_64_store + +Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lopen_tail_16_store + +Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add 
v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_128_store: + cmp x2, #64 + b.lt Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 
+ adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + 
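+	// The loop below is a plain Poly1305 absorb of each remaining whole 16-byte
+	// ciphertext block; informally (an illustrative formula, not generator output):
+	//
+	//   acc = ((acc + block + 2^128) * r) mod (2^130 - 5)
+	//
+	// with the clamped r key in x16:x17, the accumulator in x8:x9:x10, and the
+	// partial reduction done by folding the bits above 2^130 back in as *5
+	// (computed as *4 + *1).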
+Lopen_128_hash_64: + cbz x4, Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_128_hash_64 +.cfi_endproc + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-linux64.S b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-linux64.S new file mode 100644 index 0000000000..611f4366e0 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-linux64.S @@ -0,0 +1,3008 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.align 7 +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Linc: +.long 1,2,3,4 +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.type .Lpoly_hash_ad_internal,%function +.align 6 +.Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, .Lpoly_hash_intro + ret + +.Lpoly_hash_intro: + cmp x4, #16 + b.lt .Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b .Lpoly_hash_ad_internal + +.Lpoly_hash_ad_tail: + cbz x4, .Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +.Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge .Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, 
x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lpoly_hash_ad_ret: + ret +.cfi_endproc +.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal +.hidden chacha20_poly1305_seal +.type chacha20_poly1305_seal,%function +.align 6 +chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le .Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
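+	// Roughly, assuming the standard ChaCha20 state layout (an illustrative sketch,
+	// not generator output): the ld4r loads below transpose the state so that each
+	// of v0-v3 / v5-v8 / v10-v13 / v15-v18 holds one 32-bit state word across the
+	// four "vertical" blocks (whose counters become ctr+1..ctr+4 via the .Linc add),
+	// while v4 / v9 / v14 / v19 hold the ordinary row-wise state of the fifth block
+	// with counter ctr+0. The first 32 bytes of that fifth block's keystream supply
+	// the Poly1305 key pair, approximately:
+	//
+	//   r = keystream[0..16] & .Lclamp   // ends up in x16:x17
+	//   s = keystream[16..32]            // ends up in v27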
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +.Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b 
+ eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi .Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl .Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le .Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - 
v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +.Lseal_main_loop: + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 +.Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most 
(value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge .Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt .Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + 
add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le .Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b .Lseal_main_loop + +.Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
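+ // The tail is handled in three stages: the loop below encrypts and hashes whole
+ // 64 byte blocks, .Lseal_tail_64 then processes whole 16 byte blocks, and
+ // .Lseal_tail_16 zero-pads the last [0,16) bytes (mixing in extra_in where present) before hashing.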
+ cmp x2, #64 + b.lt .Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits 
and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b .Lseal_tail + +.Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt .Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b .Lseal_tail_64 + +.Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, .Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // .Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +.Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt .Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +.Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt .Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +.Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt .Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lseal_hash_extra: + cbz x4, .Lseal_finalize + +.Lseal_hash_extra_loop: + cmp x4, #16 + b.lt .Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b .Lseal_hash_extra_loop + +.Lseal_hash_extra_tail: + cbz x4, .Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +.Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt .Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +.Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + 
sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi .Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. 
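+ // That block only feeds the Poly1305 key: v2 (after clamping) becomes R and v7 becomes S,
+ // while the blocks at counters 1 and 2 provide key stream for up to 128 bytes of data.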
+ add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl .Lpoly_hash_ad_internal + b .Lseal_tail +.cfi_endproc +.size chacha20_poly1305_seal,.-chacha20_poly1305_seal + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open +.hidden chacha20_poly1305_open +.type chacha20_poly1305_open,%function +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le .Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +.Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi .Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_ad_done: + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +.Lopen_main_loop: + + cmp x2, #192 + b.lt .Lopen_tail + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, 
v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, .Lopen_main_loop_rounds_short + +.align 5 +.Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +.Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, 
v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt .Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge .Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add 
v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt .Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt .Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b .Lopen_main_loop + +.Lopen_tail: + + cbz x2, .Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le .Lopen_tail_64 + cmp x2, #128 + b.le .Lopen_tail_128 + +.Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], 
x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, .Lopen_tail_192_rounds_no_hash + +.Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +.Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b 
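+ // tbl with the ROL8 table in v26 rotates each 32-bit lane left by 8 bits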
+ tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt .Lopen_tail_192_rounds + subs x6, x6, #1 + b.ge .Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +.Lopen_tail_192_hash: + cbz x4, .Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b .Lopen_tail_192_hash + +.Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b .Lopen_tail_64_store + +.Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +.Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + 
add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt .Lopen_tail_128_rounds + cbz x4, .Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b .Lopen_tail_128_rounds + +.Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b .Lopen_tail_64_store + +.Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +.Lopen_tail_64_rounds: + add 
v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt .Lopen_tail_64_rounds + cbz x4, .Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b .Lopen_tail_64_rounds + +.Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +.Lopen_tail_64_store: + cmp x2, #16 + b.lt .Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b .Lopen_tail_64_store + +.Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, .Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +.Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt .Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +.Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt .Lopen_tail_16_store + +.Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +.Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add 
v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi .Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_128_store: + cmp x2, #64 + b.lt .Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, 
x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +.Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + 
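+ // x4 = number of whole 16-byte ciphertext blocks still to be absorbed into
+ // Poly1305 before the remaining bytes are decrypted and written out.
+ // The accumulator is kept in x10:x9:x8 (a 2-bit top limb over two 64-bit
+ // limbs) and the clamped r key in x17:x16; after each multiply the bits
+ // above 2^130 are folded back in as 4*H + H, using 2^130 = 5 (mod 2^130 - 5).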
+.Lopen_128_hash_64: + cbz x4, .Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b .Lopen_128_hash_64 +.cfi_endproc +.size chacha20_poly1305_open,.-chacha20_poly1305_open +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-win64.S b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-win64.S new file mode 100644 index 0000000000..75f34c5e6b --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_armv8-win64.S @@ -0,0 +1,3014 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + +.align 7 +Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +Linc: +.long 1,2,3,4 +Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.def Lpoly_hash_ad_internal + .type 32 +.endef +.align 6 +Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, Lpoly_hash_intro + ret + +Lpoly_hash_intro: + cmp x4, #16 + b.lt Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lpoly_hash_ad_internal + +Lpoly_hash_ad_tail: + cbz x4, Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, 
x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lpoly_hash_ad_ret: + ret +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal + +.def chacha20_poly1305_seal + .type 32 +.endef +.align 6 +chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
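+ // "Vertically" here means the ld4r loads below replicate each 32-bit state
+ // word across a whole vector, so lane i of the A0-A3/B0-B3/C0-C3/D0-D3
+ // registers belongs to block i and one NEON instruction advances all four
+ // blocks at once (the four counters in v15 are differentiated by adding Linc).
+ // The fifth block (v4, v9, v14, v19) is a conventional 4x4 state run through
+ // the same rounds; its first 32 bytes of keystream supply the clamped
+ // Poly1305 r key and the s key.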
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + 
eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - 
v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +Lseal_main_loop: + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 +Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value 
of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add 
v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b Lseal_main_loop + +Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
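+ // The tail drains in three stages: whole 64-byte chunks here, whole 16-byte
+ // blocks in Lseal_tail_64, and a final partial block in Lseal_tail_16 that
+ // is padded out with bytes taken from extra_in. Any extra_in left over is
+ // absorbed in Lseal_hash_extra before the lengths are hashed and the tag is
+ // written in Lseal_finalize.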
+ cmp x2, #64 + b.lt Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits 
and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b Lseal_tail + +Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b Lseal_tail_64 + +Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_hash_extra: + cbz x4, Lseal_finalize + +Lseal_hash_extra_loop: + cmp x4, #16 + b.lt Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lseal_hash_extra_loop + +Lseal_hash_extra_tail: + cbz x4, Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt Lseal_hash_extra_load + + // Hash in the final padded extra_in blcok + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli 
v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + b Lseal_tail +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open + +.def chacha20_poly1305_open + .type 32 +.endef +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! 
+.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_ad_done: + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +Lopen_main_loop: + + cmp x2, #192 + b.lt Lopen_tail + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, 
Lopen_main_loop_rounds_short + +.align 5 +Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 
is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, 
v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b Lopen_main_loop + +Lopen_tail: + + cbz x2, Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le Lopen_tail_64 + cmp x2, #128 + b.le Lopen_tail_128 + +Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, Lopen_tail_192_rounds_no_hash + +Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul 
x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt Lopen_tail_192_rounds + subs x6, x6, #1 
+ b.ge Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +Lopen_tail_192_hash: + cbz x4, Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_tail_192_hash + +Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b Lopen_tail_64_store + +Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, 
v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_128_rounds + cbz x4, Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_128_rounds + +Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b Lopen_tail_64_store + +Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, 
v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_64_rounds + cbz x4, Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_64_rounds + +Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +Lopen_tail_64_store: + cmp x2, #16 + b.lt Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b Lopen_tail_64_store + +Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! 
+ mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lopen_tail_16_store + +Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add 
v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_128_store: + cmp x2, #64 + b.lt Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 
+ adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + 
+Lopen_128_hash_64: + cbz x4, Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_128_hash_64 +.cfi_endproc + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-elf.S b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-elf.S new file mode 100644 index 0000000000..499d70cc83 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-elf.S @@ -0,0 +1,8940 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 64 +chacha20_poly1305_constants: +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lrol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.Lavx2_init: +.long 0,0,0,0 +.Lsse_inc: +.long 1,0,0,0 +.Lavx2_inc: +.long 2,0,0,0,2,0,0,0 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.align 16 +.Land_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +.text + 
+.type poly_hash_ad_internal,@function +.align 64 +poly_hash_ad_internal: +.cfi_startproc +.cfi_def_cfa rsp, 8 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + cmpq $13,%r8 + jne .Lhash_ad_loop +.Lpoly_fast_tls_ad: + + movq (%rcx),%r10 + movq 5(%rcx),%r11 + shrq $24,%r11 + movq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + ret +.Lhash_ad_loop: + + cmpq $16,%r8 + jb .Lhash_ad_tail + addq 0+0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rcx),%rcx + subq $16,%r8 + jmp .Lhash_ad_loop +.Lhash_ad_tail: + cmpq $0,%r8 + je .Lhash_ad_done + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +.Lhash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne .Lhash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lhash_ad_done: + ret +.cfi_endproc +.size poly_hash_ad_internal, .-poly_hash_ad_internal + +.globl chacha20_poly1305_open_sse41 +.hidden chacha20_poly1305_open_sse41 +.type chacha20_poly1305_open_sse41,@function +.align 64 +chacha20_poly1305_open_sse41: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + cmpq 
$128,%rbx + jbe .Lopen_sse_128 + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm12,%xmm7 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movq $10,%r10 +.Lopen_sse_init_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jne .Lopen_sse_init_rounds + + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + + pand .Lclamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_sse_main_loop: + cmpq $256,%rbx + jb .Lopen_sse_tail + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + + + movq $4,%rcx + movq %rsi,%r8 +.Lopen_sse_main_loop_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 
102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %rcx + jge .Lopen_sse_main_loop_rounds + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + 
mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg .Lopen_sse_main_loop_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 0+80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp .Lopen_sse_main_loop +.Lopen_sse_tail: + + testq %rbx,%rbx + jz .Lopen_sse_finalize + cmpq $192,%rbx + ja .Lopen_sse_tail_256 + cmpq $128,%rbx + ja .Lopen_sse_tail_192 + cmpq $64,%rbx + ja .Lopen_sse_tail_128 + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + cmpq $16,%rcx + jb .Lopen_sse_tail_64_rounds +.Lopen_sse_tail_64_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq 
%r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +.Lopen_sse_tail_64_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + cmpq $16,%rcx + jae .Lopen_sse_tail_64_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_64_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp .Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_128: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +.Lopen_sse_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_128_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + 
movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + cmpq %rcx,%r8 + jb .Lopen_sse_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_128_rounds + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp .Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_192: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + + movq %rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +.Lopen_sse_tail_192_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_192_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + 
paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + cmpq %rcx,%r8 + jb .Lopen_sse_tail_192_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_192_rounds + cmpq $176,%rbx + jb .Lopen_sse_tail_192_finish + addq 0+160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb .Lopen_sse_tail_192_finish + addq 0+176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq 
%r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_192_finish: + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp .Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_256: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + xorq %r8,%r8 +.Lopen_sse_tail_256_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + movdqa 0+80(%rbp),%xmm11 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq 
%r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 + movdqa 0+80(%rbp),%xmm9 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + movdqa 0+80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 + movdqa 0+80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb .Lopen_sse_tail_256_rounds_and_x1hash + + movq %rbx,%rcx + andq $-16,%rcx +.Lopen_sse_tail_256_hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq 
%rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb .Lopen_sse_tail_256_hash + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 0+80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +.Lopen_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb .Lopen_sse_tail_16_init + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp .Lopen_sse_tail_64_dec_loop +.Lopen_sse_tail_16_init: + movdqa %xmm0,%xmm1 + + +.Lopen_sse_tail_16: + testq %rbx,%rbx + jz .Lopen_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx,1),%rsi + movq %rbx,%r8 +.Lopen_sse_tail_16_compose: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz .Lopen_sse_tail_16_compose + +.byte 102,73,15,126,221 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +.Lopen_sse_tail_16_extract: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne .Lopen_sse_tail_16_extract + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lopen_sse_finalize: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq 
%r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + +.cfi_remember_state + addq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + + popq %r9 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r9 + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + ret + +.Lopen_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 + +.Lopen_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + 
paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz .Lopen_sse_128_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd %xmm15,%xmm13 + paddd .Lsse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand .Lclamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_sse_128_xor_hash: + cmpq $16,%rbx + jb .Lopen_sse_tail_16 + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp .Lopen_sse_128_xor_hash +.size chacha20_poly1305_open_sse41, .-chacha20_poly1305_open_sse41 +.cfi_endproc + + + + + + + +.globl chacha20_poly1305_seal_sse41 +.hidden chacha20_poly1305_seal_sse41 +.type chacha20_poly1305_seal_sse41,@function +.align 64 +chacha20_poly1305_seal_sse41: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + cmpq $128,%rbx + jbe .Lseal_sse_128 + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa 
%xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm12 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + movq $10,%r10 +.Lseal_sse_init_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor 
%xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jnz .Lseal_sse_init_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + + pand .Lclamp(%rip),%xmm3 + movdqa %xmm3,0+0(%rbp) + movdqa %xmm7,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja .Lseal_sse_main_init + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_init: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx + movq $8,%r8 + cmpq $64,%rbx + jbe .Lseal_sse_tail_64 + cmpq $128,%rbx + jbe .Lseal_sse_tail_128 + cmpq $192,%rbx + jbe .Lseal_sse_tail_192 + +.Lseal_sse_main_loop: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa 
%xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + +.align 32 +.Lseal_sse_main_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + 
psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + leaq 16(%rdi),%rdi + decq %r8 + jge .Lseal_sse_main_rounds + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_main_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + movdqa %xmm14,0+80(%rbp) + movdqa %xmm14,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 0+80(%rbp),%xmm14 + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 
128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + cmpq $256,%rbx + ja .Lseal_sse_main_loop_xor + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_loop_xor: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg .Lseal_sse_main_loop + movq %rbx,%rcx + testq %rbx,%rbx + je .Lseal_sse_128_tail_hash + movq $6,%rcx + cmpq $128,%rbx + ja .Lseal_sse_tail_192 + cmpq $64,%rbx + ja .Lseal_sse_tail_128 + +.Lseal_sse_tail_64: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + +.Lseal_sse_tail_64_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_64_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_64_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_64_rounds_and_x1hash + paddd 
.Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp .Lseal_sse_128_tail_xor + +.Lseal_sse_tail_128: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + +.Lseal_sse_tail_128_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_128_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb 
.Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_128_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_128_rounds_and_x1hash + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash + +.Lseal_sse_tail_192: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + +.Lseal_sse_tail_192_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_192_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 
102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_192_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_192_rounds_and_x1hash + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +.Lseal_sse_128_tail_hash: + cmpq $16,%rcx + jb .Lseal_sse_128_tail_xor + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq 
%r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp .Lseal_sse_128_tail_hash + +.Lseal_sse_128_tail_xor: + cmpq $16,%rbx + jb .Lseal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp .Lseal_sse_128_tail_xor + +.Lseal_sse_tail_16: + testq %rbx,%rbx + jz .Lprocess_blocks_of_extra_in + + movq %rbx,%r8 + movq %rbx,%rcx + leaq -1(%rsi,%rbx,1),%rsi + pxor %xmm15,%xmm15 +.Lseal_sse_tail_16_compose: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne .Lseal_sse_tail_16_compose + + + pxor %xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +.Lseal_sse_tail_16_extract: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz .Lseal_sse_tail_16_extract + + + + + + + + + movq 288 + 0 + 32(%rsp),%r9 + movq 56(%r9),%r14 + movq 48(%r9),%r13 + testq %r14,%r14 + jz .Lprocess_partial_block + + movq $16,%r15 + subq %rbx,%r15 + cmpq %r15,%r14 + + jge .Lload_extra_in + movq %r14,%r15 + +.Lload_extra_in: + + + leaq -1(%r13,%r15,1),%rsi + + + addq %r15,%r13 + subq %r15,%r14 + movq %r13,48(%r9) + movq %r14,56(%r9) + + + + addq %r15,%r8 + + + pxor %xmm11,%xmm11 +.Lload_extra_load_loop: + pslldq $1,%xmm11 + pinsrb $0,(%rsi),%xmm11 + leaq -1(%rsi),%rsi + subq $1,%r15 + jnz .Lload_extra_load_loop + + + + + movq %rbx,%r15 + +.Lload_extra_shift_loop: + pslldq $1,%xmm11 + subq $1,%r15 + jnz .Lload_extra_shift_loop + + + + + leaq .Land_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 + + + por %xmm11,%xmm15 + + + +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + 
adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lprocess_blocks_of_extra_in: + + movq 288+32+0 (%rsp),%r9 + movq 48(%r9),%rsi + movq 56(%r9),%r8 + movq %r8,%rcx + shrq $4,%r8 + +.Lprocess_extra_hash_loop: + jz process_extra_in_trailer + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rsi),%rsi + subq $1,%r8 + jmp .Lprocess_extra_hash_loop +process_extra_in_trailer: + andq $15,%rcx + movq %rcx,%rbx + jz .Ldo_length_block + leaq -1(%rsi,%rcx,1),%rsi + +.Lprocess_extra_in_trailer_load: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + subq $1,%rcx + jnz .Lprocess_extra_in_trailer_load + +.Lprocess_partial_block: + + leaq .Land_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Ldo_length_block: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + +.cfi_remember_state + addq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + + popq %r9 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r9 + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + 
ret + +.Lseal_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 + +.Lseal_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz .Lseal_sse_128_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd .Lsse_inc(%rip),%xmm15 + paddd 
%xmm15,%xmm13 + + pand .Lclamp(%rip),%xmm2 + movdqa %xmm2,0+0(%rbp) + movdqa %xmm6,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp .Lseal_sse_128_tail_xor +.size chacha20_poly1305_seal_sse41, .-chacha20_poly1305_seal_sse41 +.cfi_endproc + + +.globl chacha20_poly1305_open_avx2 +.hidden chacha20_poly1305_open_avx2 +.type chacha20_poly1305_open_avx2,@function +.align 64 +chacha20_poly1305_open_avx2: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + vzeroupper + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe .Lopen_avx2_192 + cmpq $320,%rbx + jbe .Lopen_avx2_320 + + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,0+160(%rbp) + movq $10,%r10 +.Lopen_avx2_init_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne .Lopen_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + + xorq %rcx,%rcx +.Lopen_avx2_init_hash: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq 
%r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne .Lopen_avx2_init_hash + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +.Lopen_avx2_main_loop: + + cmpq $512,%rbx + jb .Lopen_avx2_main_loop_done + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx +.Lopen_avx2_main_loop_rounds: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rsi,%rcx,1),%r10 + adcq 8+16(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld 
$25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rsi,%rcx,1),%r10 + adcq 8+32(%rsi,%rcx,1),%r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr 
$8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne .Lopen_avx2_main_loop_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 0+60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq 
%r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp .Lopen_avx2_main_loop +.Lopen_avx2_main_loop_done: + testq %rbx,%rbx + vzeroupper + je .Lopen_sse_finalize + + cmpq $384,%rbx + ja .Lopen_avx2_tail_512 + cmpq $256,%rbx + ja .Lopen_avx2_tail_384 + cmpq $128,%rbx + ja .Lopen_avx2_tail_256 + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je .Lopen_avx2_tail_128_rounds +.Lopen_avx2_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_avx2_tail_128_rounds: + addq $16,%r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_avx2_tail_128_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp .Lopen_avx2_tail_128_xor + 
+.Lopen_avx2_tail_256: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +.Lopen_avx2_tail_256_rounds_and_x1hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +.Lopen_avx2_tail_256_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + 
vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_256_rounds_and_x1hash + cmpq $10,%r8 + jne .Lopen_avx2_tail_256_rounds + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +.Lopen_avx2_tail_256_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg .Lopen_avx2_tail_256_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp .Lopen_avx2_tail_256_hash +.Lopen_avx2_tail_256_done: + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp .Lopen_avx2_tail_128_xor + +.Lopen_avx2_tail_384: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +.Lopen_avx2_tail_384_rounds_and_x2hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 
16(%rbx),%rbx +.Lopen_avx2_tail_384_rounds_and_x1hash: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor 
%ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_384_rounds_and_x2hash + cmpq $10,%r8 + jne .Lopen_avx2_tail_384_rounds_and_x1hash + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +.Lopen_avx2_384_tail_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg .Lopen_avx2_384_tail_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp .Lopen_avx2_384_tail_hash +.Lopen_avx2_384_tail_done: + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp .Lopen_avx2_tail_128_xor + +.Lopen_avx2_tail_512: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + 
xorq %rcx,%rcx + movq %rsi,%r8 +.Lopen_avx2_tail_512_rounds_and_x2hash: + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +.Lopen_avx2_tail_512_rounds_and_x1hash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 
+ vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl .Lopen_avx2_tail_512_rounds_and_x2hash + cmpq $10,%rcx + jne .Lopen_avx2_tail_512_rounds_and_x1hash + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +.Lopen_avx2_tail_512_hash: + testq %rcx,%rcx + je .Lopen_avx2_tail_512_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq 
%r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp .Lopen_avx2_tail_512_hash +.Lopen_avx2_tail_512_done: + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +.Lopen_avx2_tail_128_xor: + cmpq $32,%rbx + jb .Lopen_avx2_tail_32_xor + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp .Lopen_avx2_tail_128_xor +.Lopen_avx2_tail_32_xor: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb .Lopen_avx2_exit + subq $16,%rbx + + vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +.Lopen_avx2_exit: + vzeroupper + jmp .Lopen_sse_tail_16 + +.Lopen_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +.Lopen_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + 
vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne .Lopen_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +.Lopen_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_avx2_short_hash_and_xor_loop: + cmpq $32,%rbx + jb .Lopen_avx2_short_tail_32 + subq $32,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq 
$0,%r12 + addq 0+16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp .Lopen_avx2_short_hash_and_xor_loop +.Lopen_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb .Lopen_avx2_short_tail_32_exit + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +.Lopen_avx2_short_tail_32_exit: + vzeroupper + jmp .Lopen_sse_tail_16 + +.Lopen_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +.Lopen_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + 
vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne .Lopen_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp .Lopen_avx2_short +.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 +.cfi_endproc + + +.globl chacha20_poly1305_seal_avx2 +.hidden chacha20_poly1305_seal_avx2 +.type chacha20_poly1305_seal_avx2,@function +.align 64 +chacha20_poly1305_seal_avx2: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + vzeroupper + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe .Lseal_avx2_192 + cmpq $320,%rbx + jbe .Lseal_avx2_320 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd .Lavx2_inc(%rip),%ymm15,%ymm14 + vpaddd .Lavx2_inc(%rip),%ymm14,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm15,0+256(%rbp) + movq $10,%r10 +.Lseal_avx2_init_rounds: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr 
$4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz .Lseal_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + vpand .Lclamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0+0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 
0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu %ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe .Lseal_avx2_short_hash_remainder + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe .Lseal_avx2_tail_128 + cmpq $256,%rbx + jbe .Lseal_avx2_tail_256 + cmpq $384,%rbx + jbe .Lseal_avx2_tail_384 + cmpq $512,%rbx + jbe .Lseal_avx2_tail_512 + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) 
+ vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + 
vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + + subq $16,%rdi + movq $9,%rcx + jmp .Lseal_avx2_main_loop_rounds_entry +.align 32 +.Lseal_avx2_main_loop: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + movq $10,%rcx +.align 32 +.Lseal_avx2_main_loop_rounds: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lseal_avx2_main_loop_rounds_entry: + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 
0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor 
%ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne .Lseal_avx2_main_loop_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu 
%ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg .Lseal_avx2_main_loop + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + + cmpq $384,%rbx + ja .Lseal_avx2_tail_512 + cmpq $256,%rbx + ja .Lseal_avx2_tail_384 + cmpq $128,%rbx + ja .Lseal_avx2_tail_256 + +.Lseal_avx2_tail_128: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + +.Lseal_avx2_tail_128_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_128_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor 
%ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_128_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_128_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp .Lseal_avx2_short_loop + +.Lseal_avx2_tail_256: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + +.Lseal_avx2_tail_256_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_256_rounds_and_2xhash: + vpaddd 
%ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 
32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_256_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_256_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_tail_384: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + +.Lseal_avx2_tail_384_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_384_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq 
%rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_384_rounds_and_3xhash + decq %r8 + jge 
.Lseal_avx2_tail_384_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_tail_512: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + +.Lseal_avx2_tail_512_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_512_rounds_and_2xhash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld 
$20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq %rax,%r15 + adcq %rdx,%r9 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb 
%ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + + + + + + + + + + + + + + + + addq %rax,%r15 + adcq %rdx,%r9 + + + + + + + + + + + + + + + + + + + + + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_512_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_512_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu 
%ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $384,%rcx + leaq 384(%rsi),%rsi + subq $384,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +.Lseal_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor 
%ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne .Lseal_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp .Lseal_avx2_short + +.Lseal_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +.Lseal_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld 
$20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne .Lseal_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +.Lseal_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx +.Lseal_avx2_short_hash_remainder: + cmpq $16,%rcx + jb .Lseal_avx2_short_loop + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + addq $16,%rdi + jmp .Lseal_avx2_short_hash_remainder +.Lseal_avx2_short_loop: + cmpq $32,%rbx + jb .Lseal_avx2_short_tail + subq $32,%rbx + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa 
%ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp .Lseal_avx2_short_loop +.Lseal_avx2_short_tail: + cmpq $16,%rbx + jb .Lseal_avx2_exit + subq $16,%rbx + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm0 +.Lseal_avx2_exit: + vzeroupper + jmp .Lseal_sse_tail_16 +.cfi_endproc +.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 +#endif diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-macosx.S b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-macosx.S new file mode 100644 index 0000000000..d8040ddcfb --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-macosx.S @@ -0,0 +1,8898 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const +.p2align 6 +chacha20_poly1305_constants: +L$chacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +L$rol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +L$rol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +L$avx2_init: +.long 0,0,0,0 +L$sse_inc: +.long 1,0,0,0 +L$avx2_inc: +.long 2,0,0,0,2,0,0,0 +L$clamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.p2align 4 +L$and_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +.text + + +.p2align 6 +poly_hash_ad_internal: + + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + cmpq $13,%r8 + jne L$hash_ad_loop +L$poly_fast_tls_ad: + + movq (%rcx),%r10 + movq 5(%rcx),%r11 + shrq $24,%r11 + movq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + ret +L$hash_ad_loop: + + cmpq $16,%r8 + jb L$hash_ad_tail + addq 0+0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rcx),%rcx + subq $16,%r8 + jmp L$hash_ad_loop +L$hash_ad_tail: + cmpq $0,%r8 + je L$hash_ad_done + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +L$hash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne L$hash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$hash_ad_done: + ret + + + +.globl _chacha20_poly1305_open_sse41 +.private_extern _chacha20_poly1305_open_sse41 + +.p2align 6 +_chacha20_poly1305_open_sse41: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + cmpq $128,%rbx + jbe L$open_sse_128 + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm12,%xmm7 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movq $10,%r10 +L$open_sse_init_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb 
L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jne L$open_sse_init_rounds + + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + + pand L$clamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +L$open_sse_main_loop: + cmpq $256,%rbx + jb L$open_sse_tail + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + + + movq $4,%rcx + movq %rsi,%r8 +L$open_sse_main_loop_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx 
+ pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %rcx + jge L$open_sse_main_loop_rounds + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + 
adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg L$open_sse_main_loop_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 0+80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp L$open_sse_main_loop +L$open_sse_tail: + + testq %rbx,%rbx + jz L$open_sse_finalize + cmpq $192,%rbx + ja L$open_sse_tail_256 + cmpq $128,%rbx + ja L$open_sse_tail_192 + cmpq $64,%rbx + ja L$open_sse_tail_128 + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + cmpq $16,%rcx + jb L$open_sse_tail_64_rounds +L$open_sse_tail_64_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +L$open_sse_tail_64_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + 
psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + cmpq $16,%rcx + jae L$open_sse_tail_64_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_64_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_128: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +L$open_sse_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_128_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 
102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + cmpq %rcx,%r8 + jb L$open_sse_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_128_rounds + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_192: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + + movq %rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +L$open_sse_tail_192_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_192_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb 
L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + cmpq %rcx,%r8 + jb L$open_sse_tail_192_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_192_rounds + cmpq $176,%rbx + jb L$open_sse_tail_192_finish + addq 0+160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb L$open_sse_tail_192_finish + addq 0+176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_192_finish: + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + 
paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_256: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + xorq %r8,%r8 +L$open_sse_tail_256_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + movdqa 0+80(%rbp),%xmm11 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb 
L$rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 + movdqa 0+80(%rbp),%xmm9 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + movdqa 0+80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 + movdqa 0+80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb L$open_sse_tail_256_rounds_and_x1hash + + movq %rbx,%rcx + andq $-16,%rcx +L$open_sse_tail_256_hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb L$open_sse_tail_256_hash + paddd 
L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 0+80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +L$open_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb L$open_sse_tail_16_init + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp L$open_sse_tail_64_dec_loop +L$open_sse_tail_16_init: + movdqa %xmm0,%xmm1 + + +L$open_sse_tail_16: + testq %rbx,%rbx + jz L$open_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx,1),%rsi + movq %rbx,%r8 +L$open_sse_tail_16_compose: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz L$open_sse_tail_16_compose + +.byte 102,73,15,126,221 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +L$open_sse_tail_16_extract: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne L$open_sse_tail_16_extract + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$open_sse_finalize: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + 
andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + + + addq $288 + 0 + 32,%rsp + + + popq %r9 + + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + + ret + +L$open_sse_128: + + movdqu L$chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 + +L$open_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb 
L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz L$open_sse_128_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd L$chacha20_consts(%rip),%xmm1 + paddd L$chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd %xmm15,%xmm13 + paddd L$sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand L$clamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +L$open_sse_128_xor_hash: + cmpq $16,%rbx + jb L$open_sse_tail_16 + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp L$open_sse_128_xor_hash + + + + + + + + + +.globl _chacha20_poly1305_seal_sse41 +.private_extern _chacha20_poly1305_seal_sse41 + +.p2align 6 +_chacha20_poly1305_seal_sse41: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + cmpq $128,%rbx + jbe L$seal_sse_128 + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm12 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + movq $10,%r10 +L$seal_sse_init_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + 
pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jnz L$seal_sse_init_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 
0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + + pand L$clamp(%rip),%xmm3 + movdqa %xmm3,0+0(%rbp) + movdqa %xmm7,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja L$seal_sse_main_init + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp L$seal_sse_128_tail_hash +L$seal_sse_main_init: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx + movq $8,%r8 + cmpq $64,%rbx + jbe L$seal_sse_tail_64 + cmpq $128,%rbx + jbe L$seal_sse_tail_128 + cmpq $192,%rbx + jbe L$seal_sse_tail_192 + +L$seal_sse_main_loop: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + +.p2align 5 +L$seal_sse_main_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + 
addq %rax,%r14 + adcq %rdx,%r15 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 
102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + leaq 16(%rdi),%rdi + decq %r8 + jge L$seal_sse_main_rounds + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_main_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + movdqa %xmm14,0+80(%rbp) + movdqa %xmm14,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 0+80(%rbp),%xmm14 + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + cmpq $256,%rbx + ja L$seal_sse_main_loop_xor + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp L$seal_sse_128_tail_hash +L$seal_sse_main_loop_xor: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg L$seal_sse_main_loop + movq %rbx,%rcx + testq %rbx,%rbx + je L$seal_sse_128_tail_hash + movq $6,%rcx + cmpq $128,%rbx + ja L$seal_sse_tail_192 + cmpq $64,%rbx + ja L$seal_sse_tail_128 + +L$seal_sse_tail_64: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + +L$seal_sse_tail_64_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq 
%rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_64_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_64_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_64_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp L$seal_sse_128_tail_xor + +L$seal_sse_tail_128: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + +L$seal_sse_tail_128_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq 
%r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_128_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_128_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_128_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp L$seal_sse_128_tail_hash + +L$seal_sse_tail_192: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 
%xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + +L$seal_sse_tail_192_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_192_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 
102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_192_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_192_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +L$seal_sse_128_tail_hash: + cmpq $16,%rcx + jb L$seal_sse_128_tail_xor + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp L$seal_sse_128_tail_hash + +L$seal_sse_128_tail_xor: + cmpq $16,%rbx + jb L$seal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq 
%r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp L$seal_sse_128_tail_xor + +L$seal_sse_tail_16: + testq %rbx,%rbx + jz L$process_blocks_of_extra_in + + movq %rbx,%r8 + movq %rbx,%rcx + leaq -1(%rsi,%rbx,1),%rsi + pxor %xmm15,%xmm15 +L$seal_sse_tail_16_compose: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne L$seal_sse_tail_16_compose + + + pxor %xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +L$seal_sse_tail_16_extract: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz L$seal_sse_tail_16_extract + + + + + + + + + movq 288 + 0 + 32(%rsp),%r9 + movq 56(%r9),%r14 + movq 48(%r9),%r13 + testq %r14,%r14 + jz L$process_partial_block + + movq $16,%r15 + subq %rbx,%r15 + cmpq %r15,%r14 + + jge L$load_extra_in + movq %r14,%r15 + +L$load_extra_in: + + + leaq -1(%r13,%r15,1),%rsi + + + addq %r15,%r13 + subq %r15,%r14 + movq %r13,48(%r9) + movq %r14,56(%r9) + + + + addq %r15,%r8 + + + pxor %xmm11,%xmm11 +L$load_extra_load_loop: + pslldq $1,%xmm11 + pinsrb $0,(%rsi),%xmm11 + leaq -1(%rsi),%rsi + subq $1,%r15 + jnz L$load_extra_load_loop + + + + + movq %rbx,%r15 + +L$load_extra_shift_loop: + pslldq $1,%xmm11 + subq $1,%r15 + jnz L$load_extra_shift_loop + + + + + leaq L$and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 + + + por %xmm11,%xmm15 + + + +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$process_blocks_of_extra_in: + + movq 288+32+0 (%rsp),%r9 + movq 48(%r9),%rsi + movq 56(%r9),%r8 + movq %r8,%rcx + shrq $4,%r8 + +L$process_extra_hash_loop: + jz process_extra_in_trailer + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rsi),%rsi + subq $1,%r8 + jmp L$process_extra_hash_loop +process_extra_in_trailer: + andq $15,%rcx + movq %rcx,%rbx + jz L$do_length_block + leaq -1(%rsi,%rcx,1),%rsi + +L$process_extra_in_trailer_load: + pslldq 
$1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + subq $1,%rcx + jnz L$process_extra_in_trailer_load + +L$process_partial_block: + + leaq L$and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$do_length_block: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + + + addq $288 + 0 + 32,%rsp + + + popq %r9 + + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + + ret + +L$seal_sse_128: + + movdqu L$chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 + +L$seal_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + 
pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz L$seal_sse_128_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd L$chacha20_consts(%rip),%xmm1 + paddd L$chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd L$sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm13 + + pand L$clamp(%rip),%xmm2 + movdqa %xmm2,0+0(%rbp) + movdqa %xmm6,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp L$seal_sse_128_tail_xor + + + + +.globl _chacha20_poly1305_open_avx2 +.private_extern _chacha20_poly1305_open_avx2 + +.p2align 6 +_chacha20_poly1305_open_avx2: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + vzeroupper + vmovdqa L$chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd L$avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe L$open_avx2_192 + cmpq $320,%rbx + jbe L$open_avx2_320 + + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,0+160(%rbp) + movq $10,%r10 +L$open_avx2_init_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor 
%ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne L$open_avx2_init_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + + xorq %rcx,%rcx +L$open_avx2_init_hash: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne L$open_avx2_init_hash + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +L$open_avx2_main_loop: + + cmpq $512,%rbx + jb L$open_avx2_main_loop_done + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx +L$open_avx2_main_loop_rounds: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor 
%ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rsi,%rcx,1),%r10 + adcq 8+16(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rsi,%rcx,1),%r10 + adcq 8+32(%rsi,%rcx,1),%r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm5,%ymm8 + 
vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne L$open_avx2_main_loop_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + 
addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 0+60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp L$open_avx2_main_loop +L$open_avx2_main_loop_done: + testq %rbx,%rbx + vzeroupper + je L$open_sse_finalize + + cmpq $384,%rbx + ja L$open_avx2_tail_512 + cmpq $256,%rbx + ja L$open_avx2_tail_384 + cmpq $128,%rbx + ja L$open_avx2_tail_256 + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je L$open_avx2_tail_128_rounds +L$open_avx2_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_avx2_tail_128_rounds: + addq $16,%r8 + vpaddd 
%ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_avx2_tail_128_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_256: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +L$open_avx2_tail_256_rounds_and_x1hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +L$open_avx2_tail_256_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + 
vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_256_rounds_and_x1hash + cmpq $10,%r8 + jne L$open_avx2_tail_256_rounds + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +L$open_avx2_tail_256_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg L$open_avx2_tail_256_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp L$open_avx2_tail_256_hash +L$open_avx2_tail_256_done: + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 
$0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_384: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +L$open_avx2_tail_384_rounds_and_x2hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +L$open_avx2_tail_384_rounds_and_x1hash: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + 
movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_384_rounds_and_x2hash + cmpq $10,%r8 + jne L$open_avx2_tail_384_rounds_and_x1hash + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +L$open_avx2_384_tail_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg L$open_avx2_384_tail_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp L$open_avx2_384_tail_hash +L$open_avx2_384_tail_done: + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + 
vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_512: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx + movq %rsi,%r8 +L$open_avx2_tail_512_rounds_and_x2hash: + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +L$open_avx2_tail_512_rounds_and_x1hash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + 
adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor 
%ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl L$open_avx2_tail_512_rounds_and_x2hash + cmpq $10,%rcx + jne L$open_avx2_tail_512_rounds_and_x1hash + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +L$open_avx2_tail_512_hash: + testq %rcx,%rcx + je L$open_avx2_tail_512_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp L$open_avx2_tail_512_hash +L$open_avx2_tail_512_done: + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 
$0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +L$open_avx2_tail_128_xor: + cmpq $32,%rbx + jb L$open_avx2_tail_32_xor + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp L$open_avx2_tail_128_xor +L$open_avx2_tail_32_xor: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb L$open_avx2_exit + subq $16,%rbx + + vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +L$open_avx2_exit: + vzeroupper + jmp L$open_sse_tail_16 + +L$open_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +L$open_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor 
%ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne L$open_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +L$open_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +L$open_avx2_short_hash_and_xor_loop: + cmpq $32,%rbx + jb L$open_avx2_short_tail_32 + subq $32,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp L$open_avx2_short_hash_and_xor_loop +L$open_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb L$open_avx2_short_tail_32_exit + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +L$open_avx2_short_tail_32_exit: + 
vzeroupper + jmp L$open_sse_tail_16 + +L$open_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +L$open_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld 
$7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne L$open_avx2_320_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp L$open_avx2_short + + + + +.globl _chacha20_poly1305_seal_avx2 +.private_extern _chacha20_poly1305_seal_avx2 + +.p2align 6 +_chacha20_poly1305_seal_avx2: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + vzeroupper + vmovdqa L$chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd L$avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe L$seal_avx2_192 + cmpq $320,%rbx + jbe L$seal_avx2_320 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd L$avx2_inc(%rip),%ymm15,%ymm14 + vpaddd L$avx2_inc(%rip),%ymm14,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm15,0+256(%rbp) + movq $10,%r10 +L$seal_avx2_init_rounds: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor 
%ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz 
L$seal_avx2_init_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + vpand L$clamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0+0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu %ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe L$seal_avx2_short_hash_remainder + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe L$seal_avx2_tail_128 + cmpq $256,%rbx + jbe L$seal_avx2_tail_256 + cmpq $384,%rbx + jbe L$seal_avx2_tail_384 + cmpq $512,%rbx + jbe L$seal_avx2_tail_512 + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + 
vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld 
$25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + + subq $16,%rdi + movq $9,%rcx + jmp L$seal_avx2_main_loop_rounds_entry +.p2align 5 +L$seal_avx2_main_loop: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + movq $10,%rcx +.p2align 5 +L$seal_avx2_main_loop_rounds: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + 
vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$seal_avx2_main_loop_rounds_entry: + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld 
$20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne L$seal_avx2_main_loop_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 
+ imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg L$seal_avx2_main_loop + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + + cmpq $384,%rbx + ja L$seal_avx2_tail_512 + cmpq $256,%rbx + ja L$seal_avx2_tail_384 + cmpq $128,%rbx + ja L$seal_avx2_tail_256 + +L$seal_avx2_tail_128: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 
0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + +L$seal_avx2_tail_128_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_128_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_128_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_128_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp L$seal_avx2_short_loop + +L$seal_avx2_tail_256: + 
vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + +L$seal_avx2_tail_256_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_256_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor 
%ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_256_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_256_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_tail_384: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + +L$seal_avx2_tail_384_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_384_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + 
vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + 
vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_384_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_384_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_tail_512: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + +L$seal_avx2_tail_512_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq 
%r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_512_rounds_and_2xhash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq %rax,%r15 + adcq %rdx,%r9 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 
0+128(%rbp),%ymm12,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + + + + + + + + + + + + + + + + addq %rax,%r15 + adcq %rdx,%r9 + + + + + + + + + + + + + + + + + + + + + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_512_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_512_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 
$0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $384,%rcx + leaq 384(%rsi),%rsi + subq $384,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +L$seal_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb 
L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne L$seal_avx2_320_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp L$seal_avx2_short + +L$seal_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +L$seal_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + 
vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne L$seal_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +L$seal_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx +L$seal_avx2_short_hash_remainder: + cmpq $16,%rcx + jb L$seal_avx2_short_loop + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + addq $16,%rdi + jmp L$seal_avx2_short_hash_remainder +L$seal_avx2_short_loop: + cmpq $32,%rbx + jb L$seal_avx2_short_tail + subq $32,%rbx + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq 
%rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp L$seal_avx2_short_loop +L$seal_avx2_short_tail: + cmpq $16,%rbx + jb L$seal_avx2_exit + subq $16,%rbx + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm0 +L$seal_avx2_exit: + vzeroupper + jmp L$seal_sse_tail_16 + + +#endif diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.asm b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.asm new file mode 100644 index 0000000000..127dbe30f2 --- /dev/null +++ b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.asm @@ -0,0 +1,9021 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .rdata rdata align=8 +ALIGN 64 +chacha20_poly1305_constants: +$L$chacha20_consts: + DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' + DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +$L$rol8: + DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 + DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +$L$rol16: + DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 + DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +$L$avx2_init: + DD 0,0,0,0 +$L$sse_inc: + DD 1,0,0,0 +$L$avx2_inc: + DD 2,0,0,0,2,0,0,0 +$L$clamp: + DQ 0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC + DQ 0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF +ALIGN 16 +$L$and_masks: + DB 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +section .text code align=64 + + + +ALIGN 64 +poly_hash_ad_internal: + + + xor r10,r10 + xor r11,r11 + xor r12,r12 + cmp r8,13 + jne NEAR $L$hash_ad_loop +$L$poly_fast_tls_ad: + + mov r10,QWORD[rcx] + mov r11,QWORD[5+rcx] + shr r11,24 + mov r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + ret +$L$hash_ad_loop: + + cmp r8,16 + jb NEAR $L$hash_ad_tail + add r10,QWORD[((0+0))+rcx] + adc r11,QWORD[((8+0))+rcx] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov 
r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rcx,[16+rcx] + sub r8,16 + jmp NEAR $L$hash_ad_loop +$L$hash_ad_tail: + cmp r8,0 + je NEAR $L$hash_ad_done + + xor r13,r13 + xor r14,r14 + xor r15,r15 + add rcx,r8 +$L$hash_ad_tail_loop: + shld r14,r13,8 + shl r13,8 + movzx r15,BYTE[((-1))+rcx] + xor r13,r15 + dec rcx + dec r8 + jne NEAR $L$hash_ad_tail_loop + + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$hash_ad_done: + ret + + + +global chacha20_poly1305_open_sse41 + +ALIGN 64 +chacha20_poly1305_open_sse41: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_open_sse41: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + + cmp rbx,128 + jbe NEAR $L$open_sse_128 + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqu xmm4,XMMWORD[r9] + movdqu xmm8,XMMWORD[16+r9] + movdqu xmm12,XMMWORD[32+r9] + + movdqa xmm7,xmm12 + + movdqa XMMWORD[(160+48)+rbp],xmm4 + movdqa XMMWORD[(160+64)+rbp],xmm8 + movdqa XMMWORD[(160+96)+rbp],xmm12 + mov r10,10 +$L$open_sse_init_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec r10 + jne NEAR $L$open_sse_init_rounds + + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + + pand xmm0,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm0 + movdqa XMMWORD[(160+16)+rbp],xmm4 + + mov r8,r8 + call poly_hash_ad_internal +$L$open_sse_main_loop: + cmp rbx,16*16 + jb NEAR $L$open_sse_tail + + movdqa 
xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + + + + mov rcx,4 + mov r8,rsi +$L$open_sse_main_loop_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + + lea r8,[16+r8] + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + imul r9,r12 + add r15,r10 + adc r9,rdx +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 
102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec rcx + jge NEAR $L$open_sse_main_loop_rounds + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + cmp rcx,-6 + jg NEAR $L$open_sse_main_loop_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqa XMMWORD[(160+80)+rbp],xmm12 + movdqu xmm12,XMMWORD[((0 + 0))+rsi] + pxor xmm12,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((16 + 0))+rsi] + pxor xmm12,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((32 + 0))+rsi] + pxor xmm12,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((48 + 0))+rsi] + pxor xmm12,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm12 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + 
movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 192))+rsi] + movdqu xmm7,XMMWORD[((16 + 192))+rsi] + movdqu xmm11,XMMWORD[((32 + 192))+rsi] + movdqu xmm15,XMMWORD[((48 + 192))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,XMMWORD[((160+80))+rbp] + movdqu XMMWORD[(0 + 192)+rdi],xmm0 + movdqu XMMWORD[(16 + 192)+rdi],xmm4 + movdqu XMMWORD[(32 + 192)+rdi],xmm8 + movdqu XMMWORD[(48 + 192)+rdi],xmm15 + + lea rsi,[256+rsi] + lea rdi,[256+rdi] + sub rbx,16*16 + jmp NEAR $L$open_sse_main_loop +$L$open_sse_tail: + + test rbx,rbx + jz NEAR $L$open_sse_finalize + cmp rbx,12*16 + ja NEAR $L$open_sse_tail_256 + cmp rbx,8*16 + ja NEAR $L$open_sse_tail_192 + cmp rbx,4*16 + ja NEAR $L$open_sse_tail_128 + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm12,XMMWORD[((160+96))+rbp] + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + + xor r8,r8 + mov rcx,rbx + cmp rcx,16 + jb NEAR $L$open_sse_tail_64_rounds +$L$open_sse_tail_64_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 +$L$open_sse_tail_64_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + cmp rcx,16 + jae NEAR $L$open_sse_tail_64_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_64_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + jmp NEAR 
$L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_128: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm13,XMMWORD[((160+96))+rbp] + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + + mov rcx,rbx + and rcx,-16 + xor r8,r8 +$L$open_sse_tail_128_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_128_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + + cmp r8,rcx + jb NEAR $L$open_sse_tail_128_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_128_rounds + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 
0)+rdi],xmm1 + movdqu XMMWORD[(16 + 0)+rdi],xmm5 + movdqu XMMWORD[(32 + 0)+rdi],xmm9 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + + sub rbx,4*16 + lea rsi,[64+rsi] + lea rdi,[64+rdi] + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_192: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm14,XMMWORD[((160+96))+rbp] + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + + mov rcx,rbx + mov r8,10*16 + cmp rcx,10*16 + cmovg rcx,r8 + and rcx,-16 + xor r8,r8 +$L$open_sse_tail_192_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_192_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + 
movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + cmp r8,rcx + jb NEAR $L$open_sse_tail_192_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_192_rounds + cmp rbx,11*16 + jb NEAR $L$open_sse_tail_192_finish + add r10,QWORD[((0+160))+rsi] + adc r11,QWORD[((8+160))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + cmp rbx,12*16 + jb NEAR $L$open_sse_tail_192_finish + add r10,QWORD[((0+176))+rsi] + adc r11,QWORD[((8+176))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_192_finish: + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + sub rbx,8*16 + lea rsi,[128+rsi] + lea rdi,[128+rdi] + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_256: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + 
movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + + xor r8,r8 +$L$open_sse_tail_256_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + movdqa XMMWORD[(160+80)+rbp],xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,12 + psrld xmm4,20 + pxor xmm4,xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,7 + psrld xmm4,25 + pxor xmm4,xmm11 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,12 + psrld xmm5,20 + pxor xmm5,xmm11 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,7 + psrld xmm5,25 + pxor xmm5,xmm11 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,12 + psrld xmm6,20 + pxor xmm6,xmm11 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,7 + psrld xmm6,25 + pxor xmm6,xmm11 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + movdqa xmm11,XMMWORD[((160+80))+rbp] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa XMMWORD[(160+80)+rbp],xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol16] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,12 + psrld xmm7,20 + pxor xmm7,xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol8] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,7 + psrld xmm7,25 + pxor xmm7,xmm9 +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 + movdqa xmm9,XMMWORD[((160+80))+rbp] + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + movdqa XMMWORD[(160+80)+rbp],xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,12 + psrld xmm4,20 + pxor xmm4,xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,7 + psrld xmm4,25 + pxor xmm4,xmm11 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,12 + psrld xmm5,20 + pxor xmm5,xmm11 + paddd xmm1,xmm5 + pxor 
xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,7 + psrld xmm5,25 + pxor xmm5,xmm11 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + imul r9,r12 + add r15,r10 + adc r9,rdx + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,12 + psrld xmm6,20 + pxor xmm6,xmm11 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,7 + psrld xmm6,25 + pxor xmm6,xmm11 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + movdqa xmm11,XMMWORD[((160+80))+rbp] + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + movdqa XMMWORD[(160+80)+rbp],xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol16] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,12 + psrld xmm7,20 + pxor xmm7,xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol8] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,7 + psrld xmm7,25 + pxor xmm7,xmm9 +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 + movdqa xmm9,XMMWORD[((160+80))+rbp] + + add r8,16 + cmp r8,10*16 + jb NEAR $L$open_sse_tail_256_rounds_and_x1hash + + mov rcx,rbx + and rcx,-16 +$L$open_sse_tail_256_hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + add r8,16 + cmp r8,rcx + jb NEAR $L$open_sse_tail_256_hash + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqa XMMWORD[(160+80)+rbp],xmm12 + movdqu xmm12,XMMWORD[((0 + 0))+rsi] + pxor xmm12,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((16 + 0))+rsi] + pxor xmm12,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((32 + 0))+rsi] + pxor xmm12,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((48 + 0))+rsi] + pxor xmm12,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm12 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 
64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + movdqa xmm12,XMMWORD[((160+80))+rbp] + sub rbx,12*16 + lea rsi,[192+rsi] + lea rdi,[192+rdi] + + +$L$open_sse_tail_64_dec_loop: + cmp rbx,16 + jb NEAR $L$open_sse_tail_16_init + sub rbx,16 + movdqu xmm3,XMMWORD[rsi] + pxor xmm0,xmm3 + movdqu XMMWORD[rdi],xmm0 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + movdqa xmm0,xmm4 + movdqa xmm4,xmm8 + movdqa xmm8,xmm12 + jmp NEAR $L$open_sse_tail_64_dec_loop +$L$open_sse_tail_16_init: + movdqa xmm1,xmm0 + + +$L$open_sse_tail_16: + test rbx,rbx + jz NEAR $L$open_sse_finalize + + + + pxor xmm3,xmm3 + lea rsi,[((-1))+rbx*1+rsi] + mov r8,rbx +$L$open_sse_tail_16_compose: + pslldq xmm3,1 + pinsrb xmm3,BYTE[rsi],0 + sub rsi,1 + sub r8,1 + jnz NEAR $L$open_sse_tail_16_compose + +DB 102,73,15,126,221 + pextrq r14,xmm3,1 + + pxor xmm3,xmm1 + + +$L$open_sse_tail_16_extract: + pextrb XMMWORD[rdi],xmm3,0 + psrldq xmm3,1 + add rdi,1 + sub rbx,1 + jne NEAR $L$open_sse_tail_16_extract + + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$open_sse_finalize: + add r10,QWORD[((0+160+32))+rbp] + adc r11,QWORD[((8+160+32))+rbp] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + mov r13,r10 + mov r14,r11 + mov r15,r12 + sub r10,-5 + sbb r11,-1 + sbb r12,3 + cmovc r10,r13 + cmovc r11,r14 + cmovc r12,r15 + + add r10,QWORD[((0+160+16))+rbp] + adc r11,QWORD[((8+160+16))+rbp] + + movaps xmm6,XMMWORD[((0+0))+rbp] + movaps xmm7,XMMWORD[((16+0))+rbp] + movaps xmm8,XMMWORD[((32+0))+rbp] + movaps xmm9,XMMWORD[((48+0))+rbp] + movaps xmm10,XMMWORD[((64+0))+rbp] + movaps xmm11,XMMWORD[((80+0))+rbp] + movaps xmm12,XMMWORD[((96+0))+rbp] + movaps xmm13,XMMWORD[((112+0))+rbp] + movaps xmm14,XMMWORD[((128+0))+rbp] + movaps xmm15,XMMWORD[((144+0))+rbp] + + + add rsp,288 + 160 + 32 + + + pop r9 + + mov QWORD[r9],r10 + mov QWORD[8+r9],r11 + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$open_sse_128: + + movdqu 
xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqu xmm4,XMMWORD[r9] + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqu xmm8,XMMWORD[16+r9] + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqu xmm12,XMMWORD[32+r9] + movdqa xmm13,xmm12 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm13 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,xmm13 + mov r10,10 + +$L$open_sse_128_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + dec r10 + jnz NEAR $L$open_sse_128_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm4,xmm7 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + paddd xmm9,xmm11 + paddd xmm10,xmm11 + paddd xmm13,xmm15 + paddd xmm15,XMMWORD[$L$sse_inc] + paddd xmm14,xmm15 + + pand xmm0,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm0 + movdqa XMMWORD[(160+16)+rbp],xmm4 + + mov r8,r8 + call poly_hash_ad_internal +$L$open_sse_128_xor_hash: + cmp rbx,16 + jb NEAR $L$open_sse_tail_16 + sub rbx,16 + add 
r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + + + movdqu xmm3,XMMWORD[rsi] + pxor xmm1,xmm3 + movdqu XMMWORD[rdi],xmm1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + movdqa xmm1,xmm5 + movdqa xmm5,xmm9 + movdqa xmm9,xmm13 + movdqa xmm13,xmm2 + movdqa xmm2,xmm6 + movdqa xmm6,xmm10 + movdqa xmm10,xmm14 + jmp NEAR $L$open_sse_128_xor_hash +$L$SEH_end_chacha20_poly1305_open_sse41: + + + + + + + + +global chacha20_poly1305_seal_sse41 + +ALIGN 64 +chacha20_poly1305_seal_sse41: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_seal_sse41: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,QWORD[56+r9] + add rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + mov rbx,rdx + + cmp rbx,128 + jbe NEAR $L$seal_sse_128 + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqu xmm4,XMMWORD[r9] + movdqu xmm8,XMMWORD[16+r9] + movdqu xmm12,XMMWORD[32+r9] + + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqa xmm3,xmm0 + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqa xmm7,xmm4 + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqa xmm11,xmm8 + movdqa xmm15,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + + movdqa XMMWORD[(160+48)+rbp],xmm4 + movdqa XMMWORD[(160+64)+rbp],xmm8 + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + mov r10,10 +$L$seal_sse_init_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 
+ paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec r10 + jnz NEAR $L$seal_sse_init_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd 
xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + + pand xmm3,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm3 + movdqa XMMWORD[(160+16)+rbp],xmm7 + + mov r8,r8 + call poly_hash_ad_internal + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + cmp rbx,12*16 + ja NEAR $L$seal_sse_main_init + mov rcx,8*16 + sub rbx,8*16 + lea rsi,[128+rsi] + jmp NEAR $L$seal_sse_128_tail_hash +$L$seal_sse_main_init: + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,xmm12 + movdqu XMMWORD[(0 + 128)+rdi],xmm0 + movdqu XMMWORD[(16 + 128)+rdi],xmm4 + movdqu XMMWORD[(32 + 128)+rdi],xmm8 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + mov rcx,12*16 + sub rbx,12*16 + lea rsi,[192+rsi] + mov rcx,2 + mov r8,8 + cmp rbx,4*16 + jbe NEAR $L$seal_sse_tail_64 + cmp rbx,8*16 + jbe NEAR $L$seal_sse_tail_128 + cmp rbx,12*16 + jbe NEAR $L$seal_sse_tail_192 + +$L$seal_sse_main_loop: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + +ALIGN 32 +$L$seal_sse_main_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc 
r15,rdx + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + imul r9,r12 + add r15,r10 + adc r9,rdx +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + lea rdi,[16+rdi] + dec r8 + jge NEAR $L$seal_sse_main_rounds + add r10,QWORD[((0+0))+rdi] + adc 
r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_main_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + movdqa XMMWORD[(160+80)+rbp],xmm14 + movdqa XMMWORD[(160+80)+rbp],xmm14 + movdqu xmm14,XMMWORD[((0 + 0))+rsi] + pxor xmm14,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((16 + 0))+rsi] + pxor xmm14,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((32 + 0))+rsi] + pxor xmm14,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((48 + 0))+rsi] + pxor xmm14,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm14 + + movdqa xmm14,XMMWORD[((160+80))+rbp] + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + cmp rbx,16*16 + ja NEAR $L$seal_sse_main_loop_xor + + mov rcx,12*16 + sub rbx,12*16 + lea rsi,[192+rsi] + jmp NEAR $L$seal_sse_128_tail_hash +$L$seal_sse_main_loop_xor: + movdqu xmm3,XMMWORD[((0 + 192))+rsi] + movdqu xmm7,XMMWORD[((16 + 192))+rsi] + movdqu xmm11,XMMWORD[((32 + 192))+rsi] + movdqu xmm15,XMMWORD[((48 + 192))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,xmm12 + movdqu XMMWORD[(0 + 192)+rdi],xmm0 + movdqu XMMWORD[(16 + 192)+rdi],xmm4 + movdqu XMMWORD[(32 + 192)+rdi],xmm8 + movdqu XMMWORD[(48 + 192)+rdi],xmm15 + + lea rsi,[256+rsi] + sub rbx,16*16 + mov rcx,6 + mov r8,4 + cmp rbx,12*16 + jg NEAR $L$seal_sse_main_loop + mov rcx,rbx + test rbx,rbx + je NEAR $L$seal_sse_128_tail_hash + mov rcx,6 + cmp rbx,8*16 + ja NEAR $L$seal_sse_tail_192 + cmp rbx,4*16 + ja NEAR $L$seal_sse_tail_128 + +$L$seal_sse_tail_64: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm12,XMMWORD[((160+96))+rbp] + 
paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + +$L$seal_sse_tail_64_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_64_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_64_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_64_rounds_and_x1hash + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + jmp NEAR $L$seal_sse_128_tail_xor + +$L$seal_sse_tail_128: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm13,XMMWORD[((160+96))+rbp] + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + +$L$seal_sse_tail_128_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 
+ mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_128_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_128_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_128_rounds_and_x1hash + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 0)+rdi],xmm1 + movdqu XMMWORD[(16 + 0)+rdi],xmm5 + movdqu XMMWORD[(32 + 0)+rdi],xmm9 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + + mov rcx,4*16 + sub rbx,4*16 + lea rsi,[64+rsi] + jmp NEAR $L$seal_sse_128_tail_hash + +$L$seal_sse_tail_192: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] 
+ movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm14,XMMWORD[((160+96))+rbp] + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + +$L$seal_sse_tail_192_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_192_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd 
xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_192_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_192_rounds_and_x1hash + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + mov rcx,8*16 + sub rbx,8*16 + lea rsi,[128+rsi] + +$L$seal_sse_128_tail_hash: + cmp rcx,16 + jb NEAR $L$seal_sse_128_tail_xor + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 + lea rdi,[16+rdi] + jmp NEAR $L$seal_sse_128_tail_hash + +$L$seal_sse_128_tail_xor: + cmp rbx,16 + jb NEAR $L$seal_sse_tail_16 + sub rbx,16 + + movdqu xmm3,XMMWORD[rsi] + pxor xmm0,xmm3 + movdqu XMMWORD[rdi],xmm0 + + add r10,QWORD[rdi] + adc r11,QWORD[8+rdi] + adc r12,1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + 
add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + movdqa xmm0,xmm4 + movdqa xmm4,xmm8 + movdqa xmm8,xmm12 + movdqa xmm12,xmm1 + movdqa xmm1,xmm5 + movdqa xmm5,xmm9 + movdqa xmm9,xmm13 + jmp NEAR $L$seal_sse_128_tail_xor + +$L$seal_sse_tail_16: + test rbx,rbx + jz NEAR $L$process_blocks_of_extra_in + + mov r8,rbx + mov rcx,rbx + lea rsi,[((-1))+rbx*1+rsi] + pxor xmm15,xmm15 +$L$seal_sse_tail_16_compose: + pslldq xmm15,1 + pinsrb xmm15,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + dec rcx + jne NEAR $L$seal_sse_tail_16_compose + + + pxor xmm15,xmm0 + + + mov rcx,rbx + movdqu xmm0,xmm15 +$L$seal_sse_tail_16_extract: + pextrb XMMWORD[rdi],xmm0,0 + psrldq xmm0,1 + add rdi,1 + sub rcx,1 + jnz NEAR $L$seal_sse_tail_16_extract + + + + + + + + + mov r9,QWORD[((288 + 160 + 32))+rsp] + mov r14,QWORD[56+r9] + mov r13,QWORD[48+r9] + test r14,r14 + jz NEAR $L$process_partial_block + + mov r15,16 + sub r15,rbx + cmp r14,r15 + + jge NEAR $L$load_extra_in + mov r15,r14 + +$L$load_extra_in: + + + lea rsi,[((-1))+r15*1+r13] + + + add r13,r15 + sub r14,r15 + mov QWORD[48+r9],r13 + mov QWORD[56+r9],r14 + + + + add r8,r15 + + + pxor xmm11,xmm11 +$L$load_extra_load_loop: + pslldq xmm11,1 + pinsrb xmm11,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + sub r15,1 + jnz NEAR $L$load_extra_load_loop + + + + + mov r15,rbx + +$L$load_extra_shift_loop: + pslldq xmm11,1 + sub r15,1 + jnz NEAR $L$load_extra_shift_loop + + + + + lea r15,[$L$and_masks] + shl rbx,4 + pand xmm15,XMMWORD[((-16))+rbx*1+r15] + + + por xmm15,xmm11 + + + +DB 102,77,15,126,253 + pextrq r14,xmm15,1 + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$process_blocks_of_extra_in: + + mov r9,QWORD[((288+32+160 ))+rsp] + mov rsi,QWORD[48+r9] + mov r8,QWORD[56+r9] + mov rcx,r8 + shr r8,4 + +$L$process_extra_hash_loop: + jz NEAR process_extra_in_trailer + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rsi,[16+rsi] + sub r8,1 + jmp NEAR $L$process_extra_hash_loop +process_extra_in_trailer: + and rcx,15 + mov rbx,rcx + jz NEAR $L$do_length_block + lea rsi,[((-1))+rcx*1+rsi] + +$L$process_extra_in_trailer_load: + pslldq xmm15,1 + pinsrb xmm15,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + sub rcx,1 + jnz NEAR $L$process_extra_in_trailer_load + +$L$process_partial_block: + + lea 
r15,[$L$and_masks] + shl rbx,4 + pand xmm15,XMMWORD[((-16))+rbx*1+r15] +DB 102,77,15,126,253 + pextrq r14,xmm15,1 + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$do_length_block: + add r10,QWORD[((0+160+32))+rbp] + adc r11,QWORD[((8+160+32))+rbp] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + mov r13,r10 + mov r14,r11 + mov r15,r12 + sub r10,-5 + sbb r11,-1 + sbb r12,3 + cmovc r10,r13 + cmovc r11,r14 + cmovc r12,r15 + + add r10,QWORD[((0+160+16))+rbp] + adc r11,QWORD[((8+160+16))+rbp] + + movaps xmm6,XMMWORD[((0+0))+rbp] + movaps xmm7,XMMWORD[((16+0))+rbp] + movaps xmm8,XMMWORD[((32+0))+rbp] + movaps xmm9,XMMWORD[((48+0))+rbp] + movaps xmm10,XMMWORD[((64+0))+rbp] + movaps xmm11,XMMWORD[((80+0))+rbp] + movaps xmm12,XMMWORD[((96+0))+rbp] + movaps xmm13,XMMWORD[((112+0))+rbp] + movaps xmm14,XMMWORD[((128+0))+rbp] + movaps xmm15,XMMWORD[((144+0))+rbp] + + + add rsp,288 + 160 + 32 + + + pop r9 + + mov QWORD[r9],r10 + mov QWORD[8+r9],r11 + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$seal_sse_128: + + movdqu xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqu xmm4,XMMWORD[r9] + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqu xmm8,XMMWORD[16+r9] + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqu xmm14,XMMWORD[32+r9] + movdqa xmm12,xmm14 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm12 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,xmm12 + mov r10,10 + +$L$seal_sse_128_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb 
xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + dec r10 + jnz NEAR $L$seal_sse_128_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm4,xmm7 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + paddd xmm8,xmm11 + paddd xmm9,xmm11 + paddd xmm12,xmm15 + paddd xmm15,XMMWORD[$L$sse_inc] + paddd xmm13,xmm15 + + pand xmm2,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm2 + movdqa XMMWORD[(160+16)+rbp],xmm6 + + mov r8,r8 + call poly_hash_ad_internal + jmp NEAR $L$seal_sse_128_tail_xor +$L$SEH_end_chacha20_poly1305_seal_sse41: + + + +global chacha20_poly1305_open_avx2 + +ALIGN 64 +chacha20_poly1305_open_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_open_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + + vzeroupper + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vbroadcasti128 ymm4,XMMWORD[r9] + vbroadcasti128 ymm8,XMMWORD[16+r9] + vbroadcasti128 ymm12,XMMWORD[32+r9] + vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] + cmp rbx,6*32 + jbe NEAR $L$open_avx2_192 + cmp rbx,10*32 + jbe NEAR $L$open_avx2_320 + + vmovdqa YMMWORD[(160+64)+rbp],ymm4 + vmovdqa YMMWORD[(160+96)+rbp],ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + mov r10,10 
+$L$open_avx2_init_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + dec r10 + jne NEAR $L$open_avx2_init_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + + mov r8,r8 + call poly_hash_ad_internal + + xor rcx,rcx +$L$open_avx2_init_hash: + add r10,QWORD[((0+0))+rcx*1+rsi] + adc r11,QWORD[((8+0))+rcx*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + add rcx,16 + cmp rcx,2*32 + jne NEAR $L$open_avx2_init_hash + + vpxor ymm0,ymm0,YMMWORD[rsi] + vpxor ymm4,ymm4,YMMWORD[32+rsi] + + vmovdqu YMMWORD[rdi],ymm0 + vmovdqu YMMWORD[32+rdi],ymm4 + lea rsi,[64+rsi] + lea rdi,[64+rdi] + sub rbx,2*32 +$L$open_avx2_main_loop: + + cmp rbx,16*32 + jb NEAR $L$open_avx2_main_loop_done + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor rcx,rcx +$L$open_avx2_main_loop_rounds: + add r10,QWORD[((0+0))+rcx*1+rsi] + adc r11,QWORD[((8+0))+rcx*1+rsi] + adc r12,1 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx 
r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + add r15,rax + adc r9,rdx + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + add r10,QWORD[((0+16))+rcx*1+rsi] + adc r11,QWORD[((8+16))+rcx*1+rsi] + adc r12,1 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + add r15,rax + adc r9,rdx + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 
+ vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + add r10,QWORD[((0+32))+rcx*1+rsi] + adc r11,QWORD[((8+32))+rcx*1+rsi] + adc r12,1 + + lea rcx,[48+rcx] + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + add r15,rax + adc r9,rdx + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpalignr ymm12,ymm12,ymm12,4 + + cmp rcx,10*6*8 + jne NEAR $L$open_avx2_main_loop_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + add r10,QWORD[((0+480))+rsi] + adc r11,QWORD[((8+480))+rsi] + adc r12,1 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov 
rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + add r10,QWORD[((0+480+16))+rsi] + adc r11,QWORD[((8+480+16))+rsi] + adc r12,1 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vperm2i128 ymm3,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm12,ymm8,0x02 + vperm2i128 ymm8,ymm12,ymm8,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] + vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] + vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] + vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] + vmovdqu YMMWORD[(0+384)+rdi],ymm3 + vmovdqu YMMWORD[(32+384)+rdi],ymm0 + vmovdqu YMMWORD[(64+384)+rdi],ymm4 + vmovdqu YMMWORD[(96+384)+rdi],ymm8 + + lea rsi,[512+rsi] + lea rdi,[512+rdi] + sub rbx,16*32 + jmp NEAR $L$open_avx2_main_loop +$L$open_avx2_main_loop_done: + test rbx,rbx + vzeroupper + je NEAR $L$open_sse_finalize + + cmp rbx,12*32 + ja NEAR $L$open_avx2_tail_512 + cmp rbx,8*32 + ja NEAR $L$open_avx2_tail_384 + cmp rbx,4*32 + ja NEAR $L$open_avx2_tail_256 + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor r8,r8 + mov rcx,rbx + and rcx,-16 + test rcx,rcx + je NEAR $L$open_avx2_tail_128_rounds +$L$open_avx2_tail_128_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + 
mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_avx2_tail_128_rounds: + add r8,16 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_128_rounds_and_x1hash + cmp r8,160 + jne NEAR $L$open_avx2_tail_128_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_256: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + + mov QWORD[((160+128))+rbp],rbx + mov rcx,rbx + sub rcx,4*32 + shr rcx,4 + mov r8,10 + cmp rcx,10 + cmovg rcx,r8 + mov rbx,rsi + xor r8,r8 +$L$open_avx2_tail_256_rounds_and_x1hash: + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] +$L$open_avx2_tail_256_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd 
ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + + inc r8 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_256_rounds_and_x1hash + cmp r8,10 + jne NEAR $L$open_avx2_tail_256_rounds + mov r8,rbx + sub rbx,rsi + mov rcx,rbx + mov rbx,QWORD[((160+128))+rbp] +$L$open_avx2_tail_256_hash: + add rcx,16 + cmp rcx,rbx + jg NEAR $L$open_avx2_tail_256_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + jmp NEAR $L$open_avx2_tail_256_hash +$L$open_avx2_tail_256_done: + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm1 + vmovdqu 
YMMWORD[(64+0)+rdi],ymm5 + vmovdqu YMMWORD[(96+0)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[128+rsi] + lea rdi,[128+rdi] + sub rbx,4*32 + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_384: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + + mov QWORD[((160+128))+rbp],rbx + mov rcx,rbx + sub rcx,8*32 + shr rcx,4 + add rcx,6 + mov r8,10 + cmp rcx,10 + cmovg rcx,r8 + mov rbx,rsi + xor r8,r8 +$L$open_avx2_tail_384_rounds_and_x2hash: + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] +$L$open_avx2_tail_384_rounds_and_x1hash: + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + 
adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] + inc r8 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_384_rounds_and_x2hash + cmp r8,10 + jne NEAR $L$open_avx2_tail_384_rounds_and_x1hash + mov r8,rbx + sub rbx,rsi + mov rcx,rbx + mov rbx,QWORD[((160+128))+rbp] +$L$open_avx2_384_tail_hash: + add rcx,16 + cmp rcx,rbx + jg NEAR $L$open_avx2_384_tail_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + jmp NEAR $L$open_avx2_384_tail_hash +$L$open_avx2_384_tail_done: + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm2 + vmovdqu 
YMMWORD[(64+0)+rdi],ymm6 + vmovdqu YMMWORD[(96+0)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm1 + vmovdqu YMMWORD[(64+128)+rdi],ymm5 + vmovdqu YMMWORD[(96+128)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[256+rsi] + lea rdi,[256+rdi] + sub rbx,8*32 + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_512: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor rcx,rcx + mov r8,rsi +$L$open_avx2_tail_512_rounds_and_x2hash: + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] +$L$open_avx2_tail_512_rounds_and_x1hash: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + 
and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + add r10,QWORD[((0+16))+r8] + adc r11,QWORD[((8+16))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[32+r8] + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + 
vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + inc rcx + cmp rcx,4 + jl NEAR $L$open_avx2_tail_512_rounds_and_x2hash + cmp rcx,10 + jne NEAR $L$open_avx2_tail_512_rounds_and_x1hash + mov rcx,rbx + sub rcx,12*32 + and rcx,-16 +$L$open_avx2_tail_512_hash: + test rcx,rcx + je NEAR $L$open_avx2_tail_512_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + sub rcx,2*8 + jmp NEAR $L$open_avx2_tail_512_hash +$L$open_avx2_tail_512_done: + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + 
vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[384+rsi] + lea rdi,[384+rdi] + sub rbx,12*32 +$L$open_avx2_tail_128_xor: + cmp rbx,32 + jb NEAR $L$open_avx2_tail_32_xor + sub rbx,32 + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + lea rdi,[32+rdi] + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + jmp NEAR $L$open_avx2_tail_128_xor +$L$open_avx2_tail_32_xor: + cmp rbx,16 + vmovdqa xmm1,xmm0 + jb NEAR $L$open_avx2_exit + sub rbx,16 + + vpxor xmm1,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + vperm2i128 ymm0,ymm0,ymm0,0x11 + vmovdqa xmm1,xmm0 +$L$open_avx2_exit: + vzeroupper + jmp NEAR $L$open_sse_tail_16 + +$L$open_avx2_192: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vmovdqa ymm11,ymm12 + vmovdqa ymm15,ymm13 + mov r10,10 +$L$open_avx2_192_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + + dec r10 + jne NEAR $L$open_avx2_192_rounds + vpaddd ymm0,ymm0,ymm2 + vpaddd ymm1,ymm1,ymm2 + vpaddd ymm4,ymm4,ymm6 + vpaddd ymm5,ymm5,ymm6 + vpaddd ymm8,ymm8,ymm10 + vpaddd ymm9,ymm9,ymm10 + vpaddd ymm12,ymm12,ymm11 + vpaddd ymm13,ymm13,ymm15 + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + 
vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 +$L$open_avx2_short: + mov r8,r8 + call poly_hash_ad_internal +$L$open_avx2_short_hash_and_xor_loop: + cmp rbx,32 + jb NEAR $L$open_avx2_short_tail_32 + sub rbx,32 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rsi] + adc r11,QWORD[((8+16))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + lea rdi,[32+rdi] + + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + vmovdqa ymm12,ymm1 + vmovdqa ymm1,ymm5 + vmovdqa ymm5,ymm9 + vmovdqa ymm9,ymm13 + vmovdqa ymm13,ymm2 + vmovdqa ymm2,ymm6 + jmp NEAR $L$open_avx2_short_hash_and_xor_loop +$L$open_avx2_short_tail_32: + cmp rbx,16 + vmovdqa xmm1,xmm0 + jb NEAR $L$open_avx2_short_tail_32_exit + sub rbx,16 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + vpxor xmm3,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm3 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + vextracti128 xmm1,ymm0,1 +$L$open_avx2_short_tail_32_exit: + vzeroupper + jmp NEAR $L$open_sse_tail_16 + +$L$open_avx2_320: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + mov r10,10 +$L$open_avx2_320_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + 
vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + dec r10 + jne NEAR $L$open_avx2_320_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,ymm7 + vpaddd ymm5,ymm5,ymm7 + vpaddd ymm6,ymm6,ymm7 + vpaddd ymm8,ymm8,ymm11 + vpaddd ymm9,ymm9,ymm11 + vpaddd ymm10,ymm10,ymm11 + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 + vperm2i128 ymm9,ymm6,ymm2,0x02 + vperm2i128 ymm13,ymm14,ymm10,0x02 + vperm2i128 ymm2,ymm6,ymm2,0x13 + vperm2i128 
ymm6,ymm14,ymm10,0x13 + jmp NEAR $L$open_avx2_short +$L$SEH_end_chacha20_poly1305_open_avx2: + + + +global chacha20_poly1305_seal_avx2 + +ALIGN 64 +chacha20_poly1305_seal_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_seal_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,QWORD[56+r9] + add rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + mov rbx,rdx + + vzeroupper + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vbroadcasti128 ymm4,XMMWORD[r9] + vbroadcasti128 ymm8,XMMWORD[16+r9] + vbroadcasti128 ymm12,XMMWORD[32+r9] + vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] + cmp rbx,6*32 + jbe NEAR $L$seal_avx2_192 + cmp rbx,10*32 + jbe NEAR $L$seal_avx2_320 + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm3,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm7,ymm4 + vmovdqa YMMWORD[(160+64)+rbp],ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+96)+rbp],ymm8 + vmovdqa ymm15,ymm12 + vpaddd ymm14,ymm15,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm14,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + mov r10,10 +$L$seal_avx2_init_rounds: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld 
ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + dec r10 + jnz NEAR $L$seal_avx2_init_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vperm2i128 
ymm11,ymm15,ymm11,0x13 + vperm2i128 ymm15,ymm7,ymm3,0x02 + vperm2i128 ymm3,ymm7,ymm3,0x13 + vpand ymm15,ymm15,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm15 + mov r8,r8 + call poly_hash_ad_internal + + vpxor ymm3,ymm3,YMMWORD[rsi] + vpxor ymm11,ymm11,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm3 + vmovdqu YMMWORD[32+rdi],ymm11 + vperm2i128 ymm15,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm15,ymm15,YMMWORD[((0+64))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+64))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+64))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+64))+rsi] + vmovdqu YMMWORD[(0+64)+rdi],ymm15 + vmovdqu YMMWORD[(32+64)+rdi],ymm2 + vmovdqu YMMWORD[(64+64)+rdi],ymm6 + vmovdqu YMMWORD[(96+64)+rdi],ymm10 + vperm2i128 ymm15,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm15,ymm15,YMMWORD[((0+192))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+192))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+192))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+192))+rsi] + vmovdqu YMMWORD[(0+192)+rdi],ymm15 + vmovdqu YMMWORD[(32+192)+rdi],ymm1 + vmovdqu YMMWORD[(64+192)+rdi],ymm5 + vmovdqu YMMWORD[(96+192)+rdi],ymm9 + vperm2i128 ymm15,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm15 + + lea rsi,[320+rsi] + sub rbx,10*32 + mov rcx,10*32 + cmp rbx,4*32 + jbe NEAR $L$seal_avx2_short_hash_remainder + vpxor ymm0,ymm0,YMMWORD[rsi] + vpxor ymm4,ymm4,YMMWORD[32+rsi] + vpxor ymm8,ymm8,YMMWORD[64+rsi] + vpxor ymm12,ymm12,YMMWORD[96+rsi] + vmovdqu YMMWORD[320+rdi],ymm0 + vmovdqu YMMWORD[352+rdi],ymm4 + vmovdqu YMMWORD[384+rdi],ymm8 + vmovdqu YMMWORD[416+rdi],ymm12 + lea rsi,[128+rsi] + sub rbx,4*32 + mov rcx,8 + mov r8,2 + cmp rbx,4*32 + jbe NEAR $L$seal_avx2_tail_128 + cmp rbx,8*32 + jbe NEAR $L$seal_avx2_tail_256 + cmp rbx,12*32 + jbe NEAR $L$seal_avx2_tail_384 + cmp rbx,16*32 + jbe NEAR $L$seal_avx2_tail_512 + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor 
ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd 
ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + + sub rdi,16 + mov rcx,9 + jmp NEAR $L$seal_avx2_main_loop_rounds_entry +ALIGN 32 +$L$seal_avx2_main_loop: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + mov rcx,10 +ALIGN 32 +$L$seal_avx2_main_loop_rounds: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + add r15,rax + adc r9,rdx + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$seal_avx2_main_loop_rounds_entry: + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb 
ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + add r15,rax + adc r9,rdx + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + add r10,QWORD[((0+32))+rdi] + adc r11,QWORD[((8+32))+rdi] + adc r12,1 + + lea rdi,[48+rdi] + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + add r15,rax + adc r9,rdx + vpsrld ymm8,ymm5,25 + 
vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpalignr ymm12,ymm12,ymm12,4 + + dec rcx + jne NEAR $L$seal_avx2_main_loop_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + 
vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm12,ymm8,0x02 + vperm2i128 ymm8,ymm12,ymm8,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] + vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] + vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] + vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] + vmovdqu YMMWORD[(0+384)+rdi],ymm3 + vmovdqu YMMWORD[(32+384)+rdi],ymm0 + vmovdqu YMMWORD[(64+384)+rdi],ymm4 + vmovdqu YMMWORD[(96+384)+rdi],ymm8 + + lea rsi,[512+rsi] + sub rbx,16*32 + cmp rbx,16*32 + jg NEAR $L$seal_avx2_main_loop + + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + mov rcx,10 + xor r8,r8 + + cmp rbx,12*32 + ja NEAR $L$seal_avx2_tail_512 + cmp rbx,8*32 + ja NEAR $L$seal_avx2_tail_384 + cmp rbx,4*32 + ja NEAR $L$seal_avx2_tail_256 + +$L$seal_avx2_tail_128: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + +$L$seal_avx2_tail_128_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_128_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor 
ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_128_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_128_rounds_and_2xhash + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + jmp NEAR $L$seal_avx2_short_loop + +$L$seal_avx2_tail_256: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + +$L$seal_avx2_tail_256_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_256_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor 
ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_256_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_256_rounds_and_2xhash + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 
ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm1 + vmovdqu YMMWORD[(64+0)+rdi],ymm5 + vmovdqu YMMWORD[(96+0)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,4*32 + lea rsi,[128+rsi] + sub rbx,4*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_tail_384: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + +$L$seal_avx2_tail_384_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_384_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm2,ymm2,ymm6 + vpxor 
ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_384_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_384_rounds_and_2xhash + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] + vpxor 
ymm10,ymm10,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm2 + vmovdqu YMMWORD[(64+0)+rdi],ymm6 + vmovdqu YMMWORD[(96+0)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm1 + vmovdqu YMMWORD[(64+128)+rdi],ymm5 + vmovdqu YMMWORD[(96+128)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,8*32 + lea rsi,[256+rsi] + sub rbx,8*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_tail_512: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + +$L$seal_avx2_tail_512_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_512_rounds_and_2xhash: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb 
ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + add r15,rax + adc r9,rdx + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + 
vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + + + + + + + + + + + + + + + + add r15,rax + adc r9,rdx + + + + + + + + + + + + + + + + + + + + + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_512_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_512_rounds_and_2xhash + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,12*32 + lea rsi,[384+rsi] + sub rbx,12*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_320: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + mov r10,10 +$L$seal_avx2_320_rounds: + vpaddd 
ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + dec r10 + jne NEAR $L$seal_avx2_320_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,ymm7 + vpaddd ymm5,ymm5,ymm7 + vpaddd ymm6,ymm6,ymm7 + vpaddd ymm8,ymm8,ymm11 + vpaddd ymm9,ymm9,ymm11 + vpaddd ymm10,ymm10,ymm11 + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 
+ + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 + vperm2i128 ymm9,ymm6,ymm2,0x02 + vperm2i128 ymm13,ymm14,ymm10,0x02 + vperm2i128 ymm2,ymm6,ymm2,0x13 + vperm2i128 ymm6,ymm14,ymm10,0x13 + jmp NEAR $L$seal_avx2_short + +$L$seal_avx2_192: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vmovdqa ymm11,ymm12 + vmovdqa ymm15,ymm13 + mov r10,10 +$L$seal_avx2_192_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + + dec r10 + jne NEAR $L$seal_avx2_192_rounds + vpaddd ymm0,ymm0,ymm2 + vpaddd ymm1,ymm1,ymm2 + vpaddd ymm4,ymm4,ymm6 + vpaddd ymm5,ymm5,ymm6 + vpaddd ymm8,ymm8,ymm10 + vpaddd ymm9,ymm9,ymm10 + vpaddd ymm12,ymm12,ymm11 + vpaddd ymm13,ymm13,ymm15 + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 +$L$seal_avx2_short: + mov r8,r8 + call poly_hash_ad_internal + xor rcx,rcx +$L$seal_avx2_short_hash_remainder: + cmp rcx,16 + jb NEAR $L$seal_avx2_short_loop + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add 
r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 + add rdi,16 + jmp NEAR $L$seal_avx2_short_hash_remainder +$L$seal_avx2_short_loop: + cmp rbx,32 + jb NEAR $L$seal_avx2_short_tail + sub rbx,32 + + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + vmovdqa ymm12,ymm1 + vmovdqa ymm1,ymm5 + vmovdqa ymm5,ymm9 + vmovdqa ymm9,ymm13 + vmovdqa ymm13,ymm2 + vmovdqa ymm2,ymm6 + jmp NEAR $L$seal_avx2_short_loop +$L$seal_avx2_short_tail: + cmp rbx,16 + jb NEAR $L$seal_avx2_exit + sub rbx,16 + vpxor xmm3,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm3 + lea rsi,[16+rsi] + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + vextracti128 xmm0,ymm0,1 +$L$seal_avx2_exit: + vzeroupper + jmp NEAR $L$seal_sse_tail_16 + +$L$SEH_end_chacha20_poly1305_seal_avx2: +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.o b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.o new file mode 100644 index 0000000000..08c6c5d2bc Binary files /dev/null and b/ring-0.17.14/pregenerated/chacha20_poly1305_x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/ghash-armv4-linux32.S 
b/ring-0.17.14/pregenerated/ghash-armv4-linux32.S new file mode 100644 index 0000000000..1209e5ea16 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-armv4-linux32.S @@ -0,0 +1,242 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) +.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 d7,[r1]! @ load H + vmov.i8 q8,#0xe1 + vld1.64 d6,[r1] + vshl.i64 d17,#57 + vshr.u64 d16,#63 @ t0=0xc2....01 + vdup.8 q9,d7[7] + vshr.u64 d26,d6,#63 + vshr.s8 q9,#7 @ broadcast carry bit + vshl.i64 q3,q3,#1 + vand q8,q8,q9 + vorr d7,d26 @ H<<<=1 + veor q3,q3,q8 @ twisted H + vstmia r0,{q3} + + bx lr @ bx lr +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + vld1.64 d7,[r0]! @ load Xi + vld1.64 d6,[r0]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + mov r3,#16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 d1,[r0]! @ load Xi + vld1.64 d0,[r0]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 d7,[r2]! @ load inp + vld1.64 d6,[r2]! 
+#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + veor q3,q0 @ inp^=Xi +.Lgmult_neon: + vext.8 d16, d26, d26, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d0, d6, d6, #1 @ B1 + vmull.p8 q0, d26, d0 @ E = A*B1 + vext.8 d18, d26, d26, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d26, d22 @ G = A*B2 + vext.8 d20, d26, d26, #3 @ A3 + veor q8, q8, q0 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d0, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q0, d26, d0 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d26, d22 @ K = A*B4 + veor q10, q10, q0 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q0, d26, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q0, q0, q8 + veor q0, q0, q10 + veor d6,d6,d7 @ Karatsuba pre-processing + vext.8 d16, d28, d28, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d2, d6, d6, #1 @ B1 + vmull.p8 q1, d28, d2 @ E = A*B1 + vext.8 d18, d28, d28, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d28, d22 @ G = A*B2 + vext.8 d20, d28, d28, #3 @ A3 + veor q8, q8, q1 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d2, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q1, d28, d2 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d28, d22 @ K = A*B4 + veor q10, q10, q1 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q1, d28, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q1, q1, q8 + veor q1, q1, q10 + vext.8 d16, d27, d27, #1 @ A1 + vmull.p8 q8, d16, d7 @ F = A1*B + vext.8 d4, d7, d7, #1 @ B1 + vmull.p8 q2, d27, d4 @ E = A*B1 + vext.8 d18, d27, d27, #2 @ A2 + vmull.p8 q9, d18, d7 @ H = A2*B + vext.8 d22, d7, d7, #2 @ B2 + vmull.p8 q11, d27, d22 @ G = A*B2 + vext.8 d20, d27, d27, #3 @ A3 + veor q8, q8, q2 @ L = E + F + vmull.p8 q10, d20, d7 @ J = A3*B + vext.8 d4, d7, d7, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q2, d27, d4 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d7, d7, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d27, d22 @ K = A*B4 + veor q10, q10, q2 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q2, d27, d7 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q2, q2, q8 + veor q2, q2, q10 + veor q1,q1,q0 @ Karatsuba post-processing + veor q1,q1,q2 + veor d1,d1,d2 + veor d4,d4,d3 @ Xh|Xl - 256-bit result + + @ 
equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 q9,q0,#57 @ 1st phase + vshl.i64 q10,q0,#62 + veor q10,q10,q9 @ + vshl.i64 q9,q0,#63 + veor q10, q10, q9 @ + veor d1,d1,d20 @ + veor d4,d4,d21 + + vshr.u64 q10,q0,#1 @ 2nd phase + veor q2,q2,q0 + veor q0,q0,q10 @ + vshr.u64 q10,q10,#6 + vshr.u64 q0,q0,#1 @ + veor q0,q0,q2 @ + veor q0,q0,q10 @ + + subs r3,#16 + bne .Loop_neon + +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + sub r0,#16 + vst1.64 d1,[r0]! @ write out Xi + vst1.64 d0,[r0] + + bx lr @ bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/ghash-neon-armv8-ios64.S b/ring-0.17.14/pregenerated/ghash-neon-armv8-ios64.S new file mode 100644 index 0000000000..790cf2fe39 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-neon-armv8-ios64.S @@ -0,0 +1,333 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +.globl _gcm_init_neon +.private_extern _gcm_init_neon + +.align 4 +_gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon + +.align 4 +_gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon + +.align 4 +_gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section __TEXT,__const +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/ghash-neon-armv8-linux64.S b/ring-0.17.14/pregenerated/ghash-neon-armv8-linux64.S new file mode 100644 index 0000000000..7ddf782779 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-neon-armv8-linux64.S @@ -0,0 +1,333 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne .Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/ghash-neon-armv8-win64.S b/ring-0.17.14/pregenerated/ghash-neon-armv8-win64.S new file mode 100644 index 0000000000..db3d424f11 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-neon-armv8-win64.S @@ -0,0 +1,339 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +.globl gcm_init_neon + +.def gcm_init_neon + .type 32 +.endef +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl gcm_gmult_neon + +.def gcm_gmult_neon + .type 32 +.endef +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl gcm_ghash_neon + +.def gcm_ghash_neon + .type 32 +.endef +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section .rodata +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/ghash-x86-elf.S b/ring-0.17.14/pregenerated/ghash-x86-elf.S new file mode 100644 index 0000000000..6c1753d421 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86-elf.S @@ -0,0 +1,274 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl gcm_init_clmul +.hidden gcm_init_clmul +.type gcm_init_clmul,@function +.align 16 +gcm_init_clmul: +.L_gcm_init_clmul_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + call .L000pic +.L000pic: + popl %ecx + leal .Lbswap-.L000pic(%ecx),%ecx + movdqu (%eax),%xmm2 + pshufd $78,%xmm2,%xmm2 + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + pand 16(%ecx),%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm2,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,(%edx) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%edx) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%edx) + ret +.size gcm_init_clmul,.-.L_gcm_init_clmul_begin +.globl gcm_ghash_clmul +.hidden gcm_ghash_clmul +.type gcm_ghash_clmul,@function +.align 16 +gcm_ghash_clmul: +.L_gcm_ghash_clmul_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%eax + movl 24(%esp),%edx + movl 28(%esp),%esi + movl 32(%esp),%ebx + call .L001pic +.L001pic: + popl %ecx + leal .Lbswap-.L001pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa 
(%ecx),%xmm5 + movdqu (%edx),%xmm2 +.byte 102,15,56,0,197 + subl $16,%ebx + jz .L002odd_tail + movdqu (%esi),%xmm3 + movdqu 16(%esi),%xmm6 +.byte 102,15,56,0,221 +.byte 102,15,56,0,245 + movdqu 32(%edx),%xmm5 + pxor %xmm3,%xmm0 + pshufd $78,%xmm6,%xmm3 + movdqa %xmm6,%xmm7 + pxor %xmm6,%xmm3 + leal 32(%esi),%esi +.byte 102,15,58,68,242,0 +.byte 102,15,58,68,250,17 +.byte 102,15,58,68,221,0 + movups 16(%edx),%xmm2 + nop + subl $32,%ebx + jbe .L003even_tail + jmp .L004mod_loop +.align 32 +.L004mod_loop: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 + nop +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,229,16 + movups (%edx),%xmm2 + xorps %xmm6,%xmm0 + movdqa (%ecx),%xmm5 + xorps %xmm7,%xmm1 + movdqu (%esi),%xmm7 + pxor %xmm0,%xmm3 + movdqu 16(%esi),%xmm6 + pxor %xmm1,%xmm3 +.byte 102,15,56,0,253 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 +.byte 102,15,56,0,245 + pxor %xmm7,%xmm1 + movdqa %xmm6,%xmm7 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 +.byte 102,15,58,68,242,0 + movups 32(%edx),%xmm5 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm7,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm7,%xmm3 + pxor %xmm4,%xmm1 +.byte 102,15,58,68,250,17 + movups 16(%edx),%xmm2 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,58,68,221,0 + leal 32(%esi),%esi + subl $32,%ebx + ja .L004mod_loop +.L003even_tail: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,229,16 + movdqa (%ecx),%xmm5 + xorps %xmm6,%xmm0 + xorps %xmm7,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testl %ebx,%ebx + jnz .L005done + movups (%edx),%xmm2 +.L002odd_tail: + movdqu (%esi),%xmm3 +.byte 102,15,56,0,221 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.L005done: +.byte 102,15,56,0,197 + movdqu %xmm0,(%eax) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin +.align 64 +.Lbswap: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +.byte 
82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +.byte 0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/ghash-x86-win32n.asm b/ring-0.17.14/pregenerated/ghash-x86-win32n.asm new file mode 100644 index 0000000000..5b1cd80f36 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86-win32n.asm @@ -0,0 +1,277 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _gcm_init_clmul +align 16 +_gcm_init_clmul: +L$_gcm_init_clmul_begin: + mov edx,DWORD [4+esp] + mov eax,DWORD [8+esp] + call L$000pic +L$000pic: + pop ecx + lea ecx,[(L$bswap-L$000pic)+ecx] + movdqu xmm2,[eax] + pshufd xmm2,xmm2,78 + pshufd xmm4,xmm2,255 + movdqa xmm3,xmm2 + psllq xmm2,1 + pxor xmm5,xmm5 + psrlq xmm3,63 + pcmpgtd xmm5,xmm4 + pslldq xmm3,8 + por xmm2,xmm3 + pand xmm5,[16+ecx] + pxor xmm2,xmm5 + movdqa xmm0,xmm2 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pshufd xmm4,xmm2,78 + pxor xmm3,xmm0 + pxor xmm4,xmm2 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,220,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm2,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm2 + movdqu [edx],xmm2 + pxor xmm4,xmm0 + movdqu [16+edx],xmm0 +db 102,15,58,15,227,8 + movdqu [32+edx],xmm4 + ret +global _gcm_ghash_clmul +align 16 +_gcm_ghash_clmul: +L$_gcm_ghash_clmul_begin: + push ebp + push ebx + push esi + push edi + mov eax,DWORD [20+esp] + mov edx,DWORD [24+esp] + mov esi,DWORD [28+esp] + mov ebx,DWORD [32+esp] + call L$001pic +L$001pic: + pop ecx + lea ecx,[(L$bswap-L$001pic)+ecx] + movdqu xmm0,[eax] + movdqa xmm5,[ecx] + movdqu xmm2,[edx] +db 102,15,56,0,197 + sub ebx,16 + jz NEAR L$002odd_tail + movdqu xmm3,[esi] + movdqu xmm6,[16+esi] +db 102,15,56,0,221 +db 102,15,56,0,245 + movdqu xmm5,[32+edx] + pxor xmm0,xmm3 + pshufd xmm3,xmm6,78 + movdqa xmm7,xmm6 + pxor xmm3,xmm6 + lea esi,[32+esi] +db 102,15,58,68,242,0 +db 102,15,58,68,250,17 +db 102,15,58,68,221,0 + movups xmm2,[16+edx] + nop + sub ebx,32 + jbe NEAR L$003even_tail + jmp NEAR L$004mod_loop +align 32 +L$004mod_loop: + pshufd xmm4,xmm0,78 + movdqa xmm1,xmm0 + pxor xmm4,xmm0 + nop +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,229,16 + movups xmm2,[edx] + xorps xmm0,xmm6 + movdqa xmm5,[ecx] + xorps xmm1,xmm7 + movdqu xmm7,[esi] + pxor xmm3,xmm0 + movdqu xmm6,[16+esi] + pxor xmm3,xmm1 +db 102,15,56,0,253 + pxor xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,8 + pslldq xmm3,8 + pxor xmm1,xmm4 + pxor xmm0,xmm3 +db 102,15,56,0,245 + pxor xmm1,xmm7 + movdqa xmm7,xmm6 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 +db 102,15,58,68,242,0 + movups xmm5,[32+edx] + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq 
xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + pshufd xmm3,xmm7,78 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm3,xmm7 + pxor xmm1,xmm4 +db 102,15,58,68,250,17 + movups xmm2,[16+edx] + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +db 102,15,58,68,221,0 + lea esi,[32+esi] + sub ebx,32 + ja NEAR L$004mod_loop +L$003even_tail: + pshufd xmm4,xmm0,78 + movdqa xmm1,xmm0 + pxor xmm4,xmm0 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,229,16 + movdqa xmm5,[ecx] + xorps xmm0,xmm6 + xorps xmm1,xmm7 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + pxor xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,8 + pslldq xmm3,8 + pxor xmm1,xmm4 + pxor xmm0,xmm3 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + test ebx,ebx + jnz NEAR L$005done + movups xmm2,[edx] +L$002odd_tail: + movdqu xmm3,[esi] +db 102,15,56,0,221 + pxor xmm0,xmm3 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pshufd xmm4,xmm2,78 + pxor xmm3,xmm0 + pxor xmm4,xmm2 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,220,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +L$005done: +db 102,15,56,0,197 + movdqu [eax],xmm0 + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$bswap: +db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +db 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +db 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +db 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +db 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +db 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/ghash-x86-win32n.o b/ring-0.17.14/pregenerated/ghash-x86-win32n.o new file mode 100644 index 0000000000..f78a0eb0fe Binary files /dev/null and b/ring-0.17.14/pregenerated/ghash-x86-win32n.o differ diff --git a/ring-0.17.14/pregenerated/ghash-x86_64-elf.S b/ring-0.17.14/pregenerated/ghash-x86_64-elf.S new file mode 100644 index 0000000000..4c957b4dde --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86_64-elf.S @@ -0,0 +1,1062 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text +.globl gcm_init_clmul +.hidden gcm_init_clmul +.type gcm_init_clmul,@function +.align 16 +gcm_init_clmul: +.cfi_startproc + +_CET_ENDBR +.L_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + + + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + + + pand .L0x1c2_polynomial(%rip),%xmm5 + pxor %xmm5,%xmm2 + + + pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) + ret +.cfi_endproc + +.size gcm_init_clmul,.-gcm_init_clmul +.globl gcm_ghash_clmul +.hidden gcm_ghash_clmul +.type gcm_ghash_clmul,@function +.align 32 +gcm_ghash_clmul: +.cfi_startproc + +_CET_ENDBR +.L_ghash_clmul: + movdqa .Lbswap_mask(%rip),%xmm10 + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm7 +.byte 102,65,15,56,0,194 + + subq $0x10,%rcx + jz .Lodd_tail + + movdqu 16(%rsi),%xmm6 + cmpq $0x30,%rcx + jb .Lskip4x + + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 + + + + + movdqu 48(%rdx),%xmm3 + movdqu 
32(%rdx),%xmm11 +.byte 102,65,15,56,0,218 +.byte 102,69,15,56,0,218 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,68,15,58,68,222,0 +.byte 102,68,15,58,68,238,17 +.byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 +.byte 102,69,15,56,0,218 +.byte 102,69,15,56,0,194 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 +.byte 102,69,15,58,68,238,17 +.byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 +.byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,65,15,56,0,218 + movups 32(%rsi),%xmm7 + xorps %xmm4,%xmm8 +.byte 102,68,15,58,68,218,0 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + movdqa %xmm3,%xmm5 + pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 + movdqa %xmm8,%xmm9 +.byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa .L7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 +.byte 102,76,15,110,200 + + pand %xmm0,%xmm8 +.byte 102,69,15,56,0,200 + pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 +.byte 102,15,58,68,222,0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,238,17 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 +.byte 102,69,15,56,0,194 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jnc .Lmod4_loop + +.Ltail4x: +.byte 102,65,15,58,68,199,0 +.byte 102,65,15,58,68,207,17 +.byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $0x40,%rcx + jz .Ldone + movdqu 32(%rsi),%xmm7 + subq $0x10,%rcx + jz .Lodd_tail +.Lskip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 +.byte 102,69,15,56,0,194 +.byte 102,65,15,56,0,218 + pxor %xmm8,%xmm0 + + 
movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + leaq 32(%rdx),%rdx + nop + subq $0x20,%rcx + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm9 + pxor %xmm0,%xmm8 +.byte 102,69,15,56,0,202 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm8,%xmm4 +.byte 102,65,15,56,0,218 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 + psllq $5,%xmm0 + pxor %xmm0,%xmm8 +.byte 102,15,58,68,218,0 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm8 + pslldq $8,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%rdx),%rdx + psrlq $1,%xmm0 +.byte 102,15,58,68,231,0 + pxor %xmm1,%xmm0 + + subq $0x20,%rcx + ja .Lmod_loop + +.Leven_tail: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz .Ldone + +.Lodd_tail: + movdqu (%rdx),%xmm8 +.byte 102,69,15,56,0,194 + pxor %xmm8,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,223,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.Ldone: +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + ret +.cfi_endproc + +.size gcm_ghash_clmul,.-gcm_ghash_clmul +.globl gcm_init_avx +.hidden gcm_init_avx +.type gcm_init_avx,@function +.align 32 +gcm_init_avx: +.cfi_startproc + +_CET_ENDBR + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp .Linit_start_avx +.align 32 
+.Linit_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +.Linit_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz .Linit_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_avx,.-gcm_init_avx +.globl gcm_ghash_avx +.hidden gcm_ghash_avx +.type gcm_ghash_avx,@function +.align 32 +gcm_ghash_avx: +.cfi_startproc + +_CET_ENDBR + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq .L0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu .Lbswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb .Lshort_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 
+ vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb .Ltail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + 
vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc .Loop8x_avx + + addq $0x80,%rcx + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 
144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor %xmm10,%xmm15,%xmm15 +.Ltail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne .Lshort_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + ret +.cfi_endproc + +.size gcm_ghash_avx,.-gcm_ghash_avx +.section .rodata +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.align 64 + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/ring-0.17.14/pregenerated/ghash-x86_64-macosx.S b/ring-0.17.14/pregenerated/ghash-x86_64-macosx.S new file mode 100644 index 0000000000..dc9786ad8c --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86_64-macosx.S @@ -0,0 +1,1062 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text +.globl _gcm_init_clmul +.private_extern _gcm_init_clmul + +.p2align 4 +_gcm_init_clmul: + + +_CET_ENDBR +L$_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + + + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + + + pand L$0x1c2_polynomial(%rip),%xmm5 + pxor %xmm5,%xmm2 + + + pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) + ret + + + +.globl _gcm_ghash_clmul +.private_extern _gcm_ghash_clmul + +.p2align 5 +_gcm_ghash_clmul: + + +_CET_ENDBR +L$_ghash_clmul: + movdqa L$bswap_mask(%rip),%xmm10 + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm7 +.byte 102,65,15,56,0,194 + + subq $0x10,%rcx + jz L$odd_tail + + movdqu 16(%rsi),%xmm6 + cmpq $0x30,%rcx + jb L$skip4x + + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 + + + + + movdqu 48(%rdx),%xmm3 + movdqu 32(%rdx),%xmm11 +.byte 102,65,15,56,0,218 +.byte 102,69,15,56,0,218 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + 
pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,68,15,58,68,222,0 +.byte 102,68,15,58,68,238,17 +.byte 102,68,15,58,68,231,16 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 +.byte 102,69,15,56,0,218 +.byte 102,69,15,56,0,194 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 +.byte 102,69,15,58,68,238,17 +.byte 102,68,15,58,68,231,0 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jc L$tail4x + + jmp L$mod4_loop +.p2align 5 +L$mod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 +.byte 102,68,15,58,68,199,16 + pshufd $78,%xmm11,%xmm12 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,65,15,56,0,218 + movups 32(%rsi),%xmm7 + xorps %xmm4,%xmm8 +.byte 102,68,15,58,68,218,0 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + movdqa %xmm3,%xmm5 + pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 + movdqa %xmm8,%xmm9 +.byte 102,68,15,58,68,234,17 + pslldq $8,%xmm8 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa L$7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 +.byte 102,76,15,110,200 + + pand %xmm0,%xmm8 +.byte 102,69,15,56,0,200 + pxor %xmm0,%xmm9 +.byte 102,68,15,58,68,231,0 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 +.byte 102,15,58,68,222,0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,238,17 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 +.byte 102,69,15,56,0,194 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jnc L$mod4_loop + +L$tail4x: +.byte 102,65,15,58,68,199,0 +.byte 102,65,15,58,68,207,17 +.byte 102,68,15,58,68,199,16 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $0x40,%rcx + jz L$done + movdqu 32(%rsi),%xmm7 + subq $0x10,%rcx + jz L$odd_tail +L$skip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 +.byte 102,69,15,56,0,194 +.byte 102,65,15,56,0,218 + pxor %xmm8,%xmm0 + + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 
+.byte 102,15,58,68,231,0 + + leaq 32(%rdx),%rdx + nop + subq $0x20,%rcx + jbe L$even_tail + nop + jmp L$mod_loop + +.p2align 5 +L$mod_loop: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm9 + pxor %xmm0,%xmm8 +.byte 102,69,15,56,0,202 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm8,%xmm4 +.byte 102,65,15,56,0,218 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 + psllq $5,%xmm0 + pxor %xmm0,%xmm8 +.byte 102,15,58,68,218,0 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm8 + pslldq $8,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,234,17 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%rdx),%rdx + psrlq $1,%xmm0 +.byte 102,15,58,68,231,0 + pxor %xmm1,%xmm0 + + subq $0x20,%rcx + ja L$mod_loop + +L$even_tail: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz L$done + +L$odd_tail: + movdqu (%rdx),%xmm8 +.byte 102,69,15,56,0,194 + pxor %xmm8,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,223,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +L$done: +.byte 102,65,15,56,0,194 + movdqu %xmm0,(%rdi) + ret + + + +.globl _gcm_init_avx +.private_extern _gcm_init_avx + +.p2align 5 +_gcm_init_avx: + + +_CET_ENDBR + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp L$init_start_avx +.p2align 5 +L$init_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq 
$0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +L$init_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz L$init_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + ret + + + +.globl _gcm_ghash_avx +.private_extern _gcm_ghash_avx + +.p2align 5 +_gcm_ghash_avx: + + +_CET_ENDBR + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq L$0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu L$bswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb L$short_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor 
%xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb L$tail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp L$oop8x_avx + +.p2align 5 +L$oop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr 
$8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc L$oop8x_avx + + addq $0x80,%rcx + jmp L$tail_no_xor_avx + +.p2align 5 +L$short_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp L$tail_avx + +.p2align 5 +L$tail_avx: + vpxor %xmm10,%xmm15,%xmm15 +L$tail_no_xor_avx: 
+ vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne L$short_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$7_mask: +.long 7,0,7,0 +.p2align 6 + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/ring-0.17.14/pregenerated/ghash-x86_64-nasm.asm b/ring-0.17.14/pregenerated/ghash-x86_64-nasm.asm new file mode 100644 index 0000000000..eb04eed2d6 --- /dev/null +++ b/ring-0.17.14/pregenerated/ghash-x86_64-nasm.asm @@ -0,0 +1,1277 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + +global gcm_init_clmul + +ALIGN 16 +gcm_init_clmul: + +$L$SEH_begin_gcm_init_clmul_1: +_CET_ENDBR +$L$_init_clmul: + sub rsp,0x18 +$L$SEH_prologue_gcm_init_clmul_2: + movaps XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_clmul_3: +$L$SEH_endprologue_gcm_init_clmul_4: + movdqu xmm2,XMMWORD[rdx] + pshufd xmm2,xmm2,78 + + + pshufd xmm4,xmm2,255 + movdqa xmm3,xmm2 + psllq xmm2,1 + pxor xmm5,xmm5 + psrlq xmm3,63 + pcmpgtd xmm5,xmm4 + pslldq xmm3,8 + por xmm2,xmm3 + + + pand xmm5,XMMWORD[$L$0x1c2_polynomial] + pxor xmm2,xmm5 + + + pshufd xmm6,xmm2,78 + movdqa xmm0,xmm2 + pxor xmm6,xmm2 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm2,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm2 + movdqu XMMWORD[rcx],xmm2 + pxor xmm4,xmm0 + movdqu XMMWORD[16+rcx],xmm0 +DB 102,15,58,15,227,8 + movdqu XMMWORD[32+rcx],xmm4 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + 
pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + movdqa xmm5,xmm0 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm5,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm5 + movdqu XMMWORD[48+rcx],xmm5 + pxor xmm4,xmm0 + movdqu XMMWORD[64+rcx],xmm0 +DB 102,15,58,15,227,8 + movdqu XMMWORD[80+rcx],xmm4 + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] + ret + +$L$SEH_end_gcm_init_clmul_5: + +global gcm_ghash_clmul + +ALIGN 32 +gcm_ghash_clmul: + +$L$SEH_begin_gcm_ghash_clmul_1: +_CET_ENDBR +$L$_ghash_clmul: + lea rax,[((-136))+rsp] + lea rsp,[((-32))+rax] +$L$SEH_prologue_gcm_ghash_clmul_2: + movaps XMMWORD[(-32)+rax],xmm6 +$L$SEH_prologue_gcm_ghash_clmul_3: + movaps XMMWORD[(-16)+rax],xmm7 +$L$SEH_prologue_gcm_ghash_clmul_4: + movaps XMMWORD[rax],xmm8 +$L$SEH_prologue_gcm_ghash_clmul_5: + movaps XMMWORD[16+rax],xmm9 +$L$SEH_prologue_gcm_ghash_clmul_6: + movaps XMMWORD[32+rax],xmm10 +$L$SEH_prologue_gcm_ghash_clmul_7: + movaps XMMWORD[48+rax],xmm11 +$L$SEH_prologue_gcm_ghash_clmul_8: + movaps XMMWORD[64+rax],xmm12 +$L$SEH_prologue_gcm_ghash_clmul_9: + movaps XMMWORD[80+rax],xmm13 +$L$SEH_prologue_gcm_ghash_clmul_10: + movaps XMMWORD[96+rax],xmm14 +$L$SEH_prologue_gcm_ghash_clmul_11: + movaps XMMWORD[112+rax],xmm15 +$L$SEH_prologue_gcm_ghash_clmul_12: +$L$SEH_endprologue_gcm_ghash_clmul_13: + movdqa xmm10,XMMWORD[$L$bswap_mask] + + movdqu xmm0,XMMWORD[rcx] + movdqu xmm2,XMMWORD[rdx] + movdqu xmm7,XMMWORD[32+rdx] +DB 102,65,15,56,0,194 + + sub r9,0x10 + jz NEAR $L$odd_tail + + movdqu xmm6,XMMWORD[16+rdx] + cmp r9,0x30 + jb NEAR $L$skip4x + + sub r9,0x30 + mov rax,0xA040608020C0E000 + movdqu xmm14,XMMWORD[48+rdx] + movdqu xmm15,XMMWORD[64+rdx] + + + + + movdqu xmm3,XMMWORD[48+r8] + movdqu xmm11,XMMWORD[32+r8] +DB 102,65,15,56,0,218 +DB 102,69,15,56,0,218 + movdqa xmm5,xmm3 + pshufd xmm4,xmm3,78 + pxor xmm4,xmm3 +DB 102,15,58,68,218,0 +DB 102,15,58,68,234,17 +DB 102,15,58,68,231,0 + + movdqa xmm13,xmm11 + pshufd xmm12,xmm11,78 + pxor xmm12,xmm11 +DB 102,68,15,58,68,222,0 +DB 102,68,15,58,68,238,17 +DB 102,68,15,58,68,231,16 + xorps xmm3,xmm11 + xorps xmm5,xmm13 + movups xmm7,XMMWORD[80+rdx] + xorps xmm4,xmm12 + + movdqu xmm11,XMMWORD[16+r8] + movdqu xmm8,XMMWORD[r8] +DB 102,69,15,56,0,218 +DB 102,69,15,56,0,194 + movdqa xmm13,xmm11 + pshufd xmm12,xmm11,78 + pxor xmm0,xmm8 + pxor xmm12,xmm11 +DB 102,69,15,58,68,222,0 + movdqa xmm1,xmm0 + pshufd xmm8,xmm0,78 + pxor xmm8,xmm0 +DB 102,69,15,58,68,238,17 +DB 102,68,15,58,68,231,0 + xorps xmm3,xmm11 + xorps xmm5,xmm13 + + lea r8,[64+r8] + sub r9,0x40 + jc NEAR $L$tail4x + + jmp NEAR $L$mod4_loop +ALIGN 32 +$L$mod4_loop: +DB 102,65,15,58,68,199,0 + xorps xmm4,xmm12 + movdqu xmm11,XMMWORD[48+r8] +DB 102,69,15,56,0,218 +DB 102,65,15,58,68,207,17 + xorps xmm0,xmm3 + movdqu xmm3,XMMWORD[32+r8] + movdqa xmm13,xmm11 +DB 
102,68,15,58,68,199,16 + pshufd xmm12,xmm11,78 + xorps xmm1,xmm5 + pxor xmm12,xmm11 +DB 102,65,15,56,0,218 + movups xmm7,XMMWORD[32+rdx] + xorps xmm8,xmm4 +DB 102,68,15,58,68,218,0 + pshufd xmm4,xmm3,78 + + pxor xmm8,xmm0 + movdqa xmm5,xmm3 + pxor xmm8,xmm1 + pxor xmm4,xmm3 + movdqa xmm9,xmm8 +DB 102,68,15,58,68,234,17 + pslldq xmm8,8 + psrldq xmm9,8 + pxor xmm0,xmm8 + movdqa xmm8,XMMWORD[$L$7_mask] + pxor xmm1,xmm9 +DB 102,76,15,110,200 + + pand xmm8,xmm0 +DB 102,69,15,56,0,200 + pxor xmm9,xmm0 +DB 102,68,15,58,68,231,0 + psllq xmm9,57 + movdqa xmm8,xmm9 + pslldq xmm9,8 +DB 102,15,58,68,222,0 + psrldq xmm8,8 + pxor xmm0,xmm9 + pxor xmm1,xmm8 + movdqu xmm8,XMMWORD[r8] + + movdqa xmm9,xmm0 + psrlq xmm0,1 +DB 102,15,58,68,238,17 + xorps xmm3,xmm11 + movdqu xmm11,XMMWORD[16+r8] +DB 102,69,15,56,0,218 +DB 102,15,58,68,231,16 + xorps xmm5,xmm13 + movups xmm7,XMMWORD[80+rdx] +DB 102,69,15,56,0,194 + pxor xmm1,xmm9 + pxor xmm9,xmm0 + psrlq xmm0,5 + + movdqa xmm13,xmm11 + pxor xmm4,xmm12 + pshufd xmm12,xmm11,78 + pxor xmm0,xmm9 + pxor xmm1,xmm8 + pxor xmm12,xmm11 +DB 102,69,15,58,68,222,0 + psrlq xmm0,1 + pxor xmm0,xmm1 + movdqa xmm1,xmm0 +DB 102,69,15,58,68,238,17 + xorps xmm3,xmm11 + pshufd xmm8,xmm0,78 + pxor xmm8,xmm0 + +DB 102,68,15,58,68,231,0 + xorps xmm5,xmm13 + + lea r8,[64+r8] + sub r9,0x40 + jnc NEAR $L$mod4_loop + +$L$tail4x: +DB 102,65,15,58,68,199,0 +DB 102,65,15,58,68,207,17 +DB 102,68,15,58,68,199,16 + xorps xmm4,xmm12 + xorps xmm0,xmm3 + xorps xmm1,xmm5 + pxor xmm1,xmm0 + pxor xmm8,xmm4 + + pxor xmm8,xmm1 + pxor xmm1,xmm0 + + movdqa xmm9,xmm8 + psrldq xmm8,8 + pslldq xmm9,8 + pxor xmm1,xmm8 + pxor xmm0,xmm9 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + add r9,0x40 + jz NEAR $L$done + movdqu xmm7,XMMWORD[32+rdx] + sub r9,0x10 + jz NEAR $L$odd_tail +$L$skip4x: + + + + + + movdqu xmm8,XMMWORD[r8] + movdqu xmm3,XMMWORD[16+r8] +DB 102,69,15,56,0,194 +DB 102,65,15,56,0,218 + pxor xmm0,xmm8 + + movdqa xmm5,xmm3 + pshufd xmm4,xmm3,78 + pxor xmm4,xmm3 +DB 102,15,58,68,218,0 +DB 102,15,58,68,234,17 +DB 102,15,58,68,231,0 + + lea r8,[32+r8] + nop + sub r9,0x20 + jbe NEAR $L$even_tail + nop + jmp NEAR $L$mod_loop + +ALIGN 32 +$L$mod_loop: + movdqa xmm1,xmm0 + movdqa xmm8,xmm4 + pshufd xmm4,xmm0,78 + pxor xmm4,xmm0 + +DB 102,15,58,68,198,0 +DB 102,15,58,68,206,17 +DB 102,15,58,68,231,16 + + pxor xmm0,xmm3 + pxor xmm1,xmm5 + movdqu xmm9,XMMWORD[r8] + pxor xmm8,xmm0 +DB 102,69,15,56,0,202 + movdqu xmm3,XMMWORD[16+r8] + + pxor xmm8,xmm1 + pxor xmm1,xmm9 + pxor xmm4,xmm8 +DB 102,65,15,56,0,218 + movdqa xmm8,xmm4 + psrldq xmm8,8 + pslldq xmm4,8 + pxor xmm1,xmm8 + pxor xmm0,xmm4 + + movdqa xmm5,xmm3 + + movdqa xmm9,xmm0 + movdqa xmm8,xmm0 + psllq xmm0,5 + pxor xmm8,xmm0 +DB 102,15,58,68,218,0 + psllq xmm0,1 + pxor xmm0,xmm8 + psllq xmm0,57 + movdqa xmm8,xmm0 + pslldq xmm0,8 + psrldq xmm8,8 + pxor xmm0,xmm9 + pshufd xmm4,xmm5,78 + pxor xmm1,xmm8 + pxor xmm4,xmm5 + + movdqa xmm9,xmm0 + psrlq xmm0,1 +DB 102,15,58,68,234,17 + pxor xmm1,xmm9 + pxor xmm9,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm9 + lea r8,[32+r8] + psrlq xmm0,1 +DB 102,15,58,68,231,0 + pxor xmm0,xmm1 + + sub r9,0x20 + ja NEAR $L$mod_loop + +$L$even_tail: + movdqa xmm1,xmm0 + movdqa xmm8,xmm4 + pshufd xmm4,xmm0,78 + pxor xmm4,xmm0 + +DB 
102,15,58,68,198,0 +DB 102,15,58,68,206,17 +DB 102,15,58,68,231,16 + + pxor xmm0,xmm3 + pxor xmm1,xmm5 + pxor xmm8,xmm0 + pxor xmm8,xmm1 + pxor xmm4,xmm8 + movdqa xmm8,xmm4 + psrldq xmm8,8 + pslldq xmm4,8 + pxor xmm1,xmm8 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + test r9,r9 + jnz NEAR $L$done + +$L$odd_tail: + movdqu xmm8,XMMWORD[r8] +DB 102,69,15,56,0,194 + pxor xmm0,xmm8 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,223,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +$L$done: +DB 102,65,15,56,0,194 + movdqu XMMWORD[rcx],xmm0 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_gcm_ghash_clmul_14: + +global gcm_init_avx + +ALIGN 32 +gcm_init_avx: + +$L$SEH_begin_gcm_init_avx_1: +_CET_ENDBR + sub rsp,0x18 +$L$SEH_prologue_gcm_init_avx_2: + movaps XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_avx_3: +$L$SEH_endprologue_gcm_init_avx_4: + vzeroupper + + vmovdqu xmm2,XMMWORD[rdx] + vpshufd xmm2,xmm2,78 + + + vpshufd xmm4,xmm2,255 + vpsrlq xmm3,xmm2,63 + vpsllq xmm2,xmm2,1 + vpxor xmm5,xmm5,xmm5 + vpcmpgtd xmm5,xmm5,xmm4 + vpslldq xmm3,xmm3,8 + vpor xmm2,xmm2,xmm3 + + + vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial] + vpxor xmm2,xmm2,xmm5 + + vpunpckhqdq xmm6,xmm2,xmm2 + vmovdqa xmm0,xmm2 + vpxor xmm6,xmm6,xmm2 + mov r10,4 + jmp NEAR $L$init_start_avx +ALIGN 32 +$L$init_loop_avx: + vpalignr xmm5,xmm4,xmm3,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 +$L$init_start_avx: + vmovdqa xmm5,xmm0 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + 
vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 + vpshufd xmm3,xmm5,78 + vpshufd xmm4,xmm0,78 + vpxor xmm3,xmm3,xmm5 + vmovdqu XMMWORD[rcx],xmm5 + vpxor xmm4,xmm4,xmm0 + vmovdqu XMMWORD[16+rcx],xmm0 + lea rcx,[48+rcx] + sub r10,1 + jnz NEAR $L$init_loop_avx + + vpalignr xmm5,xmm3,xmm4,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + + vzeroupper + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] + ret +$L$SEH_end_gcm_init_avx_5: + + +global gcm_ghash_avx + +ALIGN 32 +gcm_ghash_avx: + +$L$SEH_begin_gcm_ghash_avx_1: +_CET_ENDBR + lea rax,[((-136))+rsp] + lea rsp,[((-32))+rax] +$L$SEH_prologue_gcm_ghash_avx_2: + movaps XMMWORD[(-32)+rax],xmm6 +$L$SEH_prologue_gcm_ghash_avx_3: + movaps XMMWORD[(-16)+rax],xmm7 +$L$SEH_prologue_gcm_ghash_avx_4: + movaps XMMWORD[rax],xmm8 +$L$SEH_prologue_gcm_ghash_avx_5: + movaps XMMWORD[16+rax],xmm9 +$L$SEH_prologue_gcm_ghash_avx_6: + movaps XMMWORD[32+rax],xmm10 +$L$SEH_prologue_gcm_ghash_avx_7: + movaps XMMWORD[48+rax],xmm11 +$L$SEH_prologue_gcm_ghash_avx_8: + movaps XMMWORD[64+rax],xmm12 +$L$SEH_prologue_gcm_ghash_avx_9: + movaps XMMWORD[80+rax],xmm13 +$L$SEH_prologue_gcm_ghash_avx_10: + movaps XMMWORD[96+rax],xmm14 +$L$SEH_prologue_gcm_ghash_avx_11: + movaps XMMWORD[112+rax],xmm15 +$L$SEH_prologue_gcm_ghash_avx_12: +$L$SEH_endprologue_gcm_ghash_avx_13: + vzeroupper + + vmovdqu xmm10,XMMWORD[rcx] + lea r10,[$L$0x1c2_polynomial] + lea rdx,[64+rdx] + vmovdqu xmm13,XMMWORD[$L$bswap_mask] + vpshufb xmm10,xmm10,xmm13 + cmp r9,0x80 + jb NEAR $L$short_avx + sub r9,0x80 + + vmovdqu xmm14,XMMWORD[112+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + + vpunpckhqdq xmm9,xmm14,xmm14 + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm9,xmm9,xmm14 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[80+r8] + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vmovdqu xmm15,XMMWORD[64+r8] + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + + vpshufb xmm15,xmm15,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[48+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[32+r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[16+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor 
xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + + lea r8,[128+r8] + cmp r9,0x80 + jb NEAR $L$tail_avx + + vpxor xmm15,xmm15,xmm10 + sub r9,0x80 + jmp NEAR $L$oop8x_avx + +ALIGN 32 +$L$oop8x_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[112+r8] + vpxor xmm3,xmm3,xmm0 + vpxor xmm8,xmm8,xmm15 + vpclmulqdq xmm10,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm11,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm12,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm10,xmm10,xmm3 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vxorps xmm11,xmm11,xmm4 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm12,xmm12,xmm5 + vxorps xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[80+r8] + vpxor xmm12,xmm12,xmm10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm12,xmm12,xmm11 + vpslldq xmm9,xmm12,8 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vpsrldq xmm12,xmm12,8 + vpxor xmm10,xmm10,xmm9 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vxorps xmm11,xmm11,xmm12 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[64+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vxorps xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + + vmovdqu xmm14,XMMWORD[48+r8] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[32+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + vxorps xmm10,xmm10,xmm12 + + vmovdqu xmm14,XMMWORD[16+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vxorps xmm12,xmm12,xmm11 + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[r8] 
+ vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm15,xmm15,xmm12 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + vpxor xmm15,xmm15,xmm10 + + lea r8,[128+r8] + sub r9,0x80 + jnc NEAR $L$oop8x_avx + + add r9,0x80 + jmp NEAR $L$tail_no_xor_avx + +ALIGN 32 +$L$short_avx: + vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8] + lea r8,[r9*1+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + + vmovdqa xmm3,xmm0 + vmovdqa xmm4,xmm1 + vmovdqa xmm5,xmm2 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-32))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-48))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-64))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-80))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-96))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-112))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovq xmm7,QWORD[((184-64))+rdx] + sub r9,0x10 + jmp NEAR $L$tail_avx + +ALIGN 32 +$L$tail_avx: + vpxor xmm15,xmm15,xmm10 +$L$tail_no_xor_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + + vmovdqu xmm12,XMMWORD[r10] + + vpxor xmm10,xmm3,xmm0 + vpxor xmm11,xmm4,xmm1 + vpxor xmm5,xmm5,xmm2 + + vpxor xmm5,xmm5,xmm10 + vpxor xmm5,xmm5,xmm11 + vpslldq xmm9,xmm5,8 + vpsrldq xmm5,xmm5,8 + vpxor xmm10,xmm10,xmm9 + vpxor xmm11,xmm11,xmm5 + + vpclmulqdq 
xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm9 + + vpclmulqdq xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm11 + vpxor xmm10,xmm10,xmm9 + + cmp r9,0 + jne NEAR $L$short_avx + + vpshufb xmm10,xmm10,xmm13 + vmovdqu XMMWORD[rcx],xmm10 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_gcm_ghash_avx_14: + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$0x1c2_polynomial: + DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$7_mask: + DD 7,0,7,0 +ALIGN 64 + + DB 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52 + DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 + DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 + DB 114,103,62,0 +ALIGN 64 +section .text + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_init_clmul_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_clmul_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_clmul_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_clmul_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_clmul_14 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_clmul_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_init_avx_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_avx_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_avx_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_avx_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_avx_14 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_avx_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_init_clmul_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_clmul_4-$L$SEH_begin_gcm_init_clmul_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_clmul_3-$L$SEH_begin_gcm_init_clmul_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_clmul_2-$L$SEH_begin_gcm_init_clmul_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_clmul_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_clmul_13-$L$SEH_begin_gcm_ghash_clmul_1 + DB 22 + DB 0 + DB $L$SEH_prologue_gcm_ghash_clmul_12-$L$SEH_begin_gcm_ghash_clmul_1 + DB 248 + DW 9 + DB $L$SEH_prologue_gcm_ghash_clmul_11-$L$SEH_begin_gcm_ghash_clmul_1 + DB 232 + DW 8 + DB $L$SEH_prologue_gcm_ghash_clmul_10-$L$SEH_begin_gcm_ghash_clmul_1 + DB 216 + DW 7 + DB $L$SEH_prologue_gcm_ghash_clmul_9-$L$SEH_begin_gcm_ghash_clmul_1 + DB 200 + DW 6 + DB $L$SEH_prologue_gcm_ghash_clmul_8-$L$SEH_begin_gcm_ghash_clmul_1 + DB 184 + DW 5 + DB $L$SEH_prologue_gcm_ghash_clmul_7-$L$SEH_begin_gcm_ghash_clmul_1 + DB 168 + DW 4 + DB $L$SEH_prologue_gcm_ghash_clmul_6-$L$SEH_begin_gcm_ghash_clmul_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_clmul_5-$L$SEH_begin_gcm_ghash_clmul_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_clmul_4-$L$SEH_begin_gcm_ghash_clmul_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_clmul_3-$L$SEH_begin_gcm_ghash_clmul_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_clmul_2-$L$SEH_begin_gcm_ghash_clmul_1 + DB 1 + DW 21 + +$L$SEH_info_gcm_init_avx_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_avx_4-$L$SEH_begin_gcm_init_avx_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_avx_3-$L$SEH_begin_gcm_init_avx_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_avx_2-$L$SEH_begin_gcm_init_avx_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_avx_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_avx_13-$L$SEH_begin_gcm_ghash_avx_1 + DB 22 + DB 0 + DB 
$L$SEH_prologue_gcm_ghash_avx_12-$L$SEH_begin_gcm_ghash_avx_1 + DB 248 + DW 9 + DB $L$SEH_prologue_gcm_ghash_avx_11-$L$SEH_begin_gcm_ghash_avx_1 + DB 232 + DW 8 + DB $L$SEH_prologue_gcm_ghash_avx_10-$L$SEH_begin_gcm_ghash_avx_1 + DB 216 + DW 7 + DB $L$SEH_prologue_gcm_ghash_avx_9-$L$SEH_begin_gcm_ghash_avx_1 + DB 200 + DW 6 + DB $L$SEH_prologue_gcm_ghash_avx_8-$L$SEH_begin_gcm_ghash_avx_1 + DB 184 + DW 5 + DB $L$SEH_prologue_gcm_ghash_avx_7-$L$SEH_begin_gcm_ghash_avx_1 + DB 168 + DW 4 + DB $L$SEH_prologue_gcm_ghash_avx_6-$L$SEH_begin_gcm_ghash_avx_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_avx_5-$L$SEH_begin_gcm_ghash_avx_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_avx_4-$L$SEH_begin_gcm_ghash_avx_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_avx_3-$L$SEH_begin_gcm_ghash_avx_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_avx_2-$L$SEH_begin_gcm_ghash_avx_1 + DB 1 + DW 21 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/ghash-x86_64-nasm.o b/ring-0.17.14/pregenerated/ghash-x86_64-nasm.o new file mode 100644 index 0000000000..2908f94555 Binary files /dev/null and b/ring-0.17.14/pregenerated/ghash-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/ghashv8-armx-ios64.S b/ring-0.17.14/pregenerated/ghashv8-armx-ios64.S new file mode 100644 index 0000000000..0747a5fc6e --- /dev/null +++ b/ring-0.17.14/pregenerated/ghashv8-armx-ios64.S @@ -0,0 +1,149 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__>=7 +.text + +.globl _gcm_init_clmul +.private_extern _gcm_init_clmul + +.align 4 +_gcm_init_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor 
v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret + +.globl _gcm_gmult_clmul +.private_extern _gcm_gmult_clmul + +.align 4 +_gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... + shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/ghashv8-armx-linux64.S b/ring-0.17.14/pregenerated/ghashv8-armx-linux64.S new file mode 100644 index 0000000000..4e811b7a5c --- /dev/null +++ b/ring-0.17.14/pregenerated/ghashv8-armx-linux64.S @@ -0,0 +1,149 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.globl gcm_init_clmul +.hidden gcm_init_clmul +.type gcm_init_clmul,%function +.align 4 +gcm_init_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret +.size gcm_init_clmul,.-gcm_init_clmul +.globl gcm_gmult_clmul +.hidden gcm_gmult_clmul +.type gcm_gmult_clmul,%function +.align 4 +gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/ghashv8-armx-win64.S b/ring-0.17.14/pregenerated/ghashv8-armx-win64.S new file mode 100644 index 0000000000..86386ac7be --- /dev/null +++ b/ring-0.17.14/pregenerated/ghashv8-armx-win64.S @@ -0,0 +1,153 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.globl gcm_init_clmul + +.def gcm_init_clmul + .type 32 +.endef +.align 4 +gcm_init_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull 
v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret + +.globl gcm_gmult_clmul + +.def gcm_gmult_clmul + .type 32 +.endef +.align 4 +gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... + shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/p256-armv8-asm-ios64.S b/ring-0.17.14/pregenerated/p256-armv8-asm-ios64.S new file mode 100644 index 0000000000..88355ca462 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-armv8-asm-ios64.S @@ -0,0 +1,1608 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const +.align 5 +Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +Lone: +.quad 1,0,0,0 +Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl _ecp_nistz256_mul_mont +.private_extern _ecp_nistz256_mul_mont + +.align 4 +_ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_sqr_mont +.private_extern _ecp_nistz256_sqr_mont + +.align 4 +_ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_neg +.private_extern _ecp_nistz256_neg + +.align 4 +_ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 + +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it 
borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 + +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
+ +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret + +.globl _ecp_nistz256_point_double +.private_extern _ecp_nistz256_point_double + +.align 5 +_ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _ecp_nistz256_point_add +.private_extern _ecp_nistz256_point_add + +.align 5 +_ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _ecp_nistz256_point_add_affine +.private_extern _ecp_nistz256_point_add_affine + +.align 5 +_ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,Lone_mont@PAGE-64 + add x23,x23,Lone_mont@PAGEOFF-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl _ecp_nistz256_ord_mul_mont +.private_extern _ecp_nistz256_ord_mul_mont + +.align 4 +_ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord@PAGE + add x23,x23,Lord@PAGEOFF + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh 
x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl _ecp_nistz256_ord_sqr_mont +.private_extern _ecp_nistz256_ord_sqr_mont + +.align 4 +_ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord@PAGE + add x23,x23,Lord@PAGEOFF + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b Loop_ord_sqr + +.align 4 +Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
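The diagram above is ordinary schoolbook squaring: each cross product a[i]*a[j] with i < j is formed once, the accumulated cross terms are doubled, and the diagonal squares a[i]*a[i] are added on top. A rough C equivalent of just that 256x256 -> 512-bit step, leaving out the Montgomery reduction modulo the group order that the assembly interleaves afterwards (the helper names and the unsigned __int128 type are assumptions, not the project's code):

#include <stdint.h>

/* Add the 128-bit product (hi:lo) into r at limb position pos and
 * propagate the carry through the remaining limbs of the 512-bit result. */
static void add_at(uint64_t r[8], int pos, uint64_t lo, uint64_t hi) {
    unsigned __int128 t = (unsigned __int128)r[pos] + lo;
    r[pos] = (uint64_t)t;
    t = (t >> 64) + r[pos + 1] + hi;
    r[pos + 1] = (uint64_t)t;
    uint64_t carry = (uint64_t)(t >> 64);
    for (int k = pos + 2; carry && k < 8; k++) {
        t = (unsigned __int128)r[k] + carry;
        r[k] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
}

/* r = a^2 for a 4-limb a: cross terms once, doubled, plus the diagonal. */
static void u256_sqr_sketch(uint64_t r[8], const uint64_t a[4]) {
    for (int i = 0; i < 8; i++) r[i] = 0;

    for (int i = 0; i < 4; i++)                 /* cross terms a[i]*a[j], i<j */
        for (int j = i + 1; j < 4; j++) {
            unsigned __int128 p = (unsigned __int128)a[i] * a[j];
            add_at(r, i + j, (uint64_t)p, (uint64_t)(p >> 64));
        }

    uint64_t top = 0;                           /* double the cross terms */
    for (int i = 0; i < 8; i++) {
        uint64_t next = r[i] >> 63;
        r[i] = (r[i] << 1) | top;
        top = next;
    }

    for (int i = 0; i < 4; i++) {               /* add the squares a[i]^2 */
        unsigned __int128 p = (unsigned __int128)a[i] * a[i];
        add_at(r, 2 * i, (uint64_t)p, (uint64_t)(p >> 64));
    }
}

Squaring this way needs ten limb products instead of the sixteen a general 4x4 multiplication would, which is why the code keeps a dedicated squaring path.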
+ + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl _ecp_nistz256_select_w5 +.private_extern _ecp_nistz256_select_w5 + +.align 4 +_ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +Lselect_w5_loop: + // Loop 16 times. 
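In C terms, the loop below is a constant-time table lookup: all 16 entries are read unconditionally, and only the entry whose position matches the requested index survives, selected with an all-ones/all-zero mask rather than a branch. A minimal sketch of the same idea (the function name and the plain-C masking are illustrative; the actual routine keeps the accumulators in NEON registers and selects with BIT):

#include <stdint.h>
#include <string.h>

/* Copy entry `index` (1-based) out of a table of 16 entries of 12 limbs
 * (3 x 256 bits) without letting the memory access pattern depend on index.
 * index == 0 leaves the output all zero, matching the assembly. */
static void select_w5_sketch(uint64_t val[12],
                             const uint64_t table[16][12], unsigned index) {
    memset(val, 0, 12 * sizeof(uint64_t));
    for (unsigned i = 0; i < 16; i++) {
        uint64_t diff = (uint64_t)(i + 1) ^ (uint64_t)index;
        /* mask = all ones iff i + 1 == index, computed without branching */
        uint64_t mask = ((diff | (0 - diff)) >> 63) - 1;
        for (int j = 0; j < 12; j++)
            val[j] |= table[i][j] & mask;
    }
}

Touching every entry regardless of the index is the point: the scalar window being looked up is secret, so the load addresses must not depend on it.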
+ + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz w9, #4, Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret + + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl _ecp_nistz256_select_w7 +.private_extern _ecp_nistz256_select_w7 + +.align 4 +_ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/p256-armv8-asm-linux64.S b/ring-0.17.14/pregenerated/p256-armv8-asm-linux64.S new file mode 100644 index 0000000000..e9a8c521f4 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-armv8-asm-linux64.S @@ -0,0 +1,1608 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata +.align 5 +.Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +.LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +.Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +.Lone: +.quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.hidden ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.hidden ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.hidden ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs 
x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 +.type __ecp_nistz256_sqr_mont,%function +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. 
This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... +.type __ecp_nistz256_add_to,%function +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to + +.type __ecp_nistz256_sub_from,%function +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +.globl ecp_nistz256_point_double +.hidden ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +.Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.hidden ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +.Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +.Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.hidden ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add 
x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,.Lone_mont-64 + add x23,x23,:lo12:.Lone_mont-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.hidden ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,%function +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,.Lord + add x23,x23,:lo12:.Lord + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont +.hidden ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,.Lord + add x23,x23,:lo12:.Lord + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs 
x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,.Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 +.hidden ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,%function +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +.Lselect_w5_loop: + // Loop 16 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz w9, #4, .Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 +.hidden ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,%function +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +.Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? 
All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, .Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/p256-armv8-asm-win64.S b/ring-0.17.14/pregenerated/p256-armv8-asm-win64.S new file mode 100644 index 0000000000..efc4581039 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-armv8-asm-win64.S @@ -0,0 +1,1640 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata +.align 5 +Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +Lone: +.quad 1,0,0,0 +Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont + +.def ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont + +.def ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg + +.def ecp_nistz256_neg + .type 32 +.endef +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 +.def __ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs 
x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 +.def __ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... 
+.def __ecp_nistz256_add_to + .type 32 +.endef +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_from + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_morf + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_div_by_2 + .type 32 +.endef +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret + +.globl ecp_nistz256_point_double + +.def ecp_nistz256_point_double + .type 32 +.endef +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add + +.def ecp_nistz256_point_add + .type 32 +.endef +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). 
difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add_affine + +.def ecp_nistz256_point_add_affine + .type 32 +.endef +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,Lone_mont-64 + add x23,x23,:lo12:Lone_mont-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont + +.def ecp_nistz256_ord_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + 
adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont + +.def ecp_nistz256_ord_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b Loop_ord_sqr + +.align 4 +Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
+ + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 + +.def ecp_nistz256_select_w5 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +Lselect_w5_loop: + // Loop 16 times. 
+ + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back + tbz w9, #4, Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret + + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 + +.def ecp_nistz256_select_w7 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/p256-x86_64-asm-elf.S b/ring-0.17.14/pregenerated/p256-x86_64-asm-elf.S new file mode 100644 index 0000000000..deeec078b2 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-x86_64-asm-elf.S @@ -0,0 +1,4599 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + +.section .rodata +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + + +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.text + + + +.globl ecp_nistz256_neg +.hidden ecp_nistz256_neg +.type ecp_nistz256_neg,@function +.align 32 +ecp_nistz256_neg: +.cfi_startproc +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lneg_body: + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_neg,.-ecp_nistz256_neg + + + + + + +.globl ecp_nistz256_ord_mul_mont_nohw +.hidden ecp_nistz256_ord_mul_mont_nohw +.type ecp_nistz256_ord_mul_mont_nohw,@function +.align 32 +ecp_nistz256_ord_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq .Lord(%rip),%r14 + movq .LordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq 
%r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw + + + + + + + +.globl ecp_nistz256_ord_sqr_mont_nohw +.hidden ecp_nistz256_ord_sqr_mont_nohw +.type ecp_nistz256_ord_sqr_mont_nohw,@function +.align 32 +ecp_nistz256_ord_sqr_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq .Lord(%rip),%rsi + movq 
%rdx,%rbx + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax +.byte 102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz .Loop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 
+.Lord_sqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw + +.globl ecp_nistz256_ord_mul_mont_adx +.hidden ecp_nistz256_ord_mul_mont_adx +.type ecp_nistz256_ord_mul_mont_adx,@function +.align 32 +ecp_nistz256_ord_mul_mont_adx: +.cfi_startproc +.Lecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + 
adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx + +.globl ecp_nistz256_ord_sqr_mont_adx +.hidden ecp_nistz256_ord_sqr_mont_adx +.type ecp_nistz256_ord_sqr_mont_adx,@function +.align 32 +ecp_nistz256_ord_sqr_mont_adx: +.cfi_startproc +_CET_ENDBR +.Lecp_nistz256_ord_sqr_mont_adx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + 
mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx + + + + + + +.globl ecp_nistz256_mul_mont_nohw +.hidden ecp_nistz256_mul_mont_nohw +.type ecp_nistz256_mul_mont_nohw,@function +.align 32 +ecp_nistz256_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmul_body: + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw + +.type __ecp_nistz256_mul_montq,@function +.align 32 +__ecp_nistz256_mul_montq: +.cfi_startproc + + + movq %rax,%rbp + mulq %r9 + movq .Lpoly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq .Lpoly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq 
%rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + + + + + + + + +.globl ecp_nistz256_sqr_mont_nohw +.hidden ecp_nistz256_sqr_mont_nohw +.type ecp_nistz256_sqr_mont_nohw,@function +.align 32 +ecp_nistz256_sqr_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqr_body: + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw + +.type __ecp_nistz256_sqr_montq,@function +.align 32 +__ecp_nistz256_sqr_montq: +.cfi_startproc + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 
16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq .Lpoly+8(%rip),%rsi + movq .Lpoly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.globl ecp_nistz256_mul_mont_adx +.hidden ecp_nistz256_mul_mont_adx +.type ecp_nistz256_mul_mont_adx,@function +.align 32 +ecp_nistz256_mul_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx + +.type __ecp_nistz256_mul_montx,@function +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq .Lpoly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + 
adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq .Lpoly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.globl ecp_nistz256_sqr_mont_adx +.hidden ecp_nistz256_sqr_mont_adx +.type ecp_nistz256_sqr_mont_adx,@function +.align 32 +ecp_nistz256_sqr_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx + +.type __ecp_nistz256_sqr_montx,@function +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq 
.Lpoly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq .Lpoly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx + + +.globl ecp_nistz256_select_w5_nohw +.hidden ecp_nistz256_select_w5_nohw +.type ecp_nistz256_select_w5_nohw,@function +.align 32 +ecp_nistz256_select_w5_nohw: +.cfi_startproc +_CET_ENDBR + movdqa .LOne(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +.Lselect_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz .Lselect_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_nohw: +.size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw + + + +.globl ecp_nistz256_select_w7_nohw +.hidden ecp_nistz256_select_w7_nohw +.type ecp_nistz256_select_w7_nohw,@function +.align 32 +ecp_nistz256_select_w7_nohw: +.cfi_startproc +_CET_ENDBR + movdqa .LOne(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +.Lselect_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz .Lselect_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_nohw: +.size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw + + +.globl ecp_nistz256_select_w5_avx2 +.hidden ecp_nistz256_select_w5_avx2 +.type ecp_nistz256_select_w5_avx2,@function +.align 32 
+ecp_nistz256_select_w5_avx2: +.cfi_startproc +_CET_ENDBR + vzeroupper + vmovdqa .LTwo(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa .LOne(%rip),%ymm5 + vmovdqa .LTwo(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +.Lselect_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_avx2: +.size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 + + + +.globl ecp_nistz256_select_w7_avx2 +.hidden ecp_nistz256_select_w7_avx2 +.type ecp_nistz256_select_w7_avx2,@function +.align 32 +ecp_nistz256_select_w7_avx2: +.cfi_startproc +_CET_ENDBR + vzeroupper + vmovdqa .LThree(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa .LOne(%rip),%ymm4 + vmovdqa .LTwo(%rip),%ymm8 + vmovdqa .LThree(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +.Lselect_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz .Lselect_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_avx2: +.size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 +.type __ecp_nistz256_add_toq,@function +.align 32 +__ecp_nistz256_add_toq: +.cfi_startproc + xorq %r11,%r11 + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,@function +.align 32 +__ecp_nistz256_sub_fromq: +.cfi_startproc + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + 
sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,@function +.align 32 +__ecp_nistz256_subq: +.cfi_startproc + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + ret +.cfi_endproc +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,@function +.align 32 +__ecp_nistz256_mul_by_2q: +.cfi_startproc + xorq %r11,%r11 + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double_nohw +.hidden ecp_nistz256_point_double_nohw +.type ecp_nistz256_point_double_nohw,@function +.align 32 +ecp_nistz256_point_double_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doubleq_body: + +.Lpoint_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 
16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doubleq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double_nohw,.-ecp_nistz256_point_double_nohw +.globl ecp_nistz256_point_add_nohw +.hidden ecp_nistz256_point_add_nohw +.type ecp_nistz256_point_add_nohw,@function +.align 32 +ecp_nistz256_point_add_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addq_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa 
%xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz .Ladd_proceedq + + + + testq %r9,%r9 + jz .Ladd_doubleq + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_doneq + +.align 32 +.Ladd_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutq +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 
8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + 
movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_doneq: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_nohw,.-ecp_nistz256_point_add_nohw +.globl ecp_nistz256_point_add_affine_nohw +.hidden ecp_nistz256_point_add_affine_nohw +.type ecp_nistz256_point_add_affine_nohw,@function +.align 32 +ecp_nistz256_point_add_affine_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affineq_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 
8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq 
-16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affineq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine_nohw,.-ecp_nistz256_point_add_affine_nohw +.type __ecp_nistz256_add_tox,@function +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,@function +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,@function +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + ret +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,@function +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +.globl ecp_nistz256_point_double_adx +.hidden ecp_nistz256_point_double_adx +.type ecp_nistz256_point_double_adx,@function +.align 32 +ecp_nistz256_point_double_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doublex_body: + +.Lpoint_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 
102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq 
(%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doublex_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double_adx,.-ecp_nistz256_point_double_adx +.globl ecp_nistz256_point_add_adx +.hidden ecp_nistz256_point_add_adx +.type ecp_nistz256_point_add_adx,@function +.align 32 +ecp_nistz256_point_add_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 
0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz .Ladd_proceedx + + + + testq %r9,%r9 + jz .Ladd_doublex + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_donex + +.align 32 +.Ladd_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutx +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + 
pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_donex: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_adx,.-ecp_nistz256_point_add_adx +.globl ecp_nistz256_point_add_affine_adx +.hidden ecp_nistz256_point_add_affine_adx +.type ecp_nistz256_point_add_affine_adx,@function +.align 32 +ecp_nistz256_point_add_affine_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd 
$0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + 
pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affinex_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine_adx,.-ecp_nistz256_point_add_affine_adx +#endif diff --git a/ring-0.17.14/pregenerated/p256-x86_64-asm-macosx.S b/ring-0.17.14/pregenerated/p256-x86_64-asm-macosx.S new file mode 100644 index 0000000000..035c37fc95 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-x86_64-asm-macosx.S @@ -0,0 +1,4513 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + +.section __DATA,__const +.p2align 6 +L$poly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +L$One: +.long 1,1,1,1,1,1,1,1 +L$Two: +.long 2,2,2,2,2,2,2,2 +L$Three: +.long 3,3,3,3,3,3,3,3 +L$ONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + + +L$ord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +L$ordK: +.quad 0xccd1c8aaee00bc4f +.text + + + +.globl _ecp_nistz256_neg +.private_extern _ecp_nistz256_neg + +.p2align 5 +_ecp_nistz256_neg: + +_CET_ENDBR + pushq %r12 + + pushq %r13 + +L$neg_body: + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq L$poly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$neg_epilogue: + ret + + + + + + + + +.globl _ecp_nistz256_ord_mul_mont_nohw +.private_extern _ecp_nistz256_ord_mul_mont_nohw + +.p2align 5 +_ecp_nistz256_ord_mul_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq L$ord(%rip),%r14 + movq L$ordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 
+ + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + 
cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mul_epilogue: + ret + + + + + + + + + +.globl _ecp_nistz256_ord_sqr_mont_nohw +.private_extern _ecp_nistz256_ord_sqr_mont_nohw + +.p2align 5 +_ecp_nistz256_ord_sqr_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq L$ord(%rip),%rsi + movq %rdx,%rbx + jmp L$oop_ord_sqr + +.p2align 5 +L$oop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 +.byte 102,72,15,110,205 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax +.byte 102,73,15,110,214 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax +.byte 102,73,15,110,223 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 +.byte 102,72,15,126,200 + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 +.byte 102,72,15,126,208 + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 +.byte 102,72,15,126,216 + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + 
subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz L$oop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqr_epilogue: + ret + + + +.globl _ecp_nistz256_ord_mul_mont_adx +.private_extern _ecp_nistz256_ord_mul_mont_adx + +.p2align 5 +_ecp_nistz256_ord_mul_mont_adx: + +L$ecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq L$ord-128(%rip),%r14 + movq L$ordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 
16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mulx_epilogue: + ret + + + +.globl _ecp_nistz256_ord_sqr_mont_adx +.private_extern _ecp_nistz256_ord_sqr_mont_adx + +.p2align 5 +_ecp_nistz256_ord_sqr_mont_adx: + +_CET_ENDBR +L$ecp_nistz256_ord_sqr_mont_adx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq L$ord(%rip),%rsi + jmp L$oop_ord_sqrx + +.p2align 5 +L$oop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + 
sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz L$oop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqrx_epilogue: + ret + + + + + + + + +.globl _ecp_nistz256_mul_mont_nohw +.private_extern _ecp_nistz256_mul_mont_nohw + +.p2align 5 +_ecp_nistz256_mul_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mul_body: + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mul_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_mul_montq: + + + + movq %rax,%rbp + mulq %r9 + movq L$poly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq L$poly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq 
%rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + + + + + + + +.globl _ecp_nistz256_sqr_mont_nohw +.private_extern _ecp_nistz256_sqr_mont_nohw + +.p2align 5 +_ecp_nistz256_sqr_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqr_body: + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqr_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_sqr_montq: + + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq L$poly+8(%rip),%rsi + movq L$poly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret + + +.globl _ecp_nistz256_mul_mont_adx +.private_extern _ecp_nistz256_mul_mont_adx + +.p2align 5 +_ecp_nistz256_mul_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 
16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mulx_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_mul_montx: + + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq L$poly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq L$poly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + +.globl _ecp_nistz256_sqr_mont_adx +.private_extern _ecp_nistz256_sqr_mont_adx + +.p2align 5 +_ecp_nistz256_sqr_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqrx_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_sqr_montx: + + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + 
adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq L$poly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq L$poly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret + + + + +.globl _ecp_nistz256_select_w5_nohw +.private_extern _ecp_nistz256_select_w5_nohw + +.p2align 5 +_ecp_nistz256_select_w5_nohw: + +_CET_ENDBR + movdqa L$One(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +L$select_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz L$select_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + ret + +L$SEH_end_ecp_nistz256_select_w5_nohw: + + + + +.globl _ecp_nistz256_select_w7_nohw +.private_extern _ecp_nistz256_select_w7_nohw + +.p2align 5 +_ecp_nistz256_select_w7_nohw: + +_CET_ENDBR + movdqa L$One(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +L$select_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz L$select_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + ret + +L$SEH_end_ecp_nistz256_select_w7_nohw: 
+ + + +.globl _ecp_nistz256_select_w5_avx2 +.private_extern _ecp_nistz256_select_w5_avx2 + +.p2align 5 +_ecp_nistz256_select_w5_avx2: + +_CET_ENDBR + vzeroupper + vmovdqa L$Two(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa L$One(%rip),%ymm5 + vmovdqa L$Two(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +L$select_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz L$select_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + ret + +L$SEH_end_ecp_nistz256_select_w5_avx2: + + + + +.globl _ecp_nistz256_select_w7_avx2 +.private_extern _ecp_nistz256_select_w7_avx2 + +.p2align 5 +_ecp_nistz256_select_w7_avx2: + +_CET_ENDBR + vzeroupper + vmovdqa L$Three(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa L$One(%rip),%ymm4 + vmovdqa L$Two(%rip),%ymm8 + vmovdqa L$Three(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +L$select_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz L$select_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper + ret + +L$SEH_end_ecp_nistz256_select_w7_avx2: + + +.p2align 5 +__ecp_nistz256_add_toq: + + xorq %r11,%r11 + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_sub_fromq: + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq 
%r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_subq: + + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + ret + + + + +.p2align 5 +__ecp_nistz256_mul_by_2q: + + xorq %r11,%r11 + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + +.globl _ecp_nistz256_point_double_nohw +.private_extern _ecp_nistz256_point_double_nohw + +.p2align 5 +_ecp_nistz256_point_double_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $160+8,%rsp + +L$point_doubleq_body: + +L$point_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 
128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doubleq_epilogue: + ret + + +.globl _ecp_nistz256_point_add_nohw +.private_extern _ecp_nistz256_point_add_nohw + +.p2align 5 +_ecp_nistz256_point_add_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $576+8,%rsp + +L$point_addq_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call 
__ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz L$add_proceedq + + + + testq %r9,%r9 + jz L$add_doubleq + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_doneq + +.p2align 5 +L$add_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + + jmp L$point_double_shortcutq + + +.p2align 5 +L$add_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 
192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_doneq: + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addq_epilogue: + ret + + +.globl _ecp_nistz256_point_add_affine_nohw +.private_extern _ecp_nistz256_point_add_affine_nohw + +.p2align 5 +_ecp_nistz256_point_add_affine_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $480+8,%rsp + +L$add_affineq_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call 
__ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + 
pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affineq_epilogue: + ret + + + +.p2align 5 +__ecp_nistz256_add_tox: + + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_sub_fromx: + + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_subx: + + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + ret + + + + +.p2align 5 +__ecp_nistz256_mul_by_2x: + + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + +.globl _ecp_nistz256_point_double_adx +.private_extern _ecp_nistz256_point_double_adx + +.p2align 5 +_ecp_nistz256_point_double_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $160+8,%rsp + +L$point_doublex_body: + +L$point_double_shortcutx: + 
movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + 
call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doublex_epilogue: + ret + + +.globl _ecp_nistz256_point_add_adx +.private_extern _ecp_nistz256_point_add_adx + +.p2align 5 +_ecp_nistz256_point_add_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $576+8,%rsp + +L$point_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + 
orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + orq %r8,%r12 +.byte 0x3e + jnz L$add_proceedx + + + + testq %r9,%r9 + jz L$add_doublex + + + + + + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_donex + +.p2align 5 +L$add_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + + jmp L$point_double_shortcutx + + +.p2align 5 +L$add_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + 
movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_donex: + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addx_epilogue: + ret + + +.globl _ecp_nistz256_point_add_affine_adx +.private_extern _ecp_nistz256_point_add_affine_adx + +.p2align 5 +_ecp_nistz256_point_add_affine_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $480+8,%rsp + +L$add_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 
96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + 
movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affinex_epilogue: + ret + + +#endif diff --git a/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.asm b/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.asm new file mode 100644 index 0000000000..e837575870 --- /dev/null +++ b/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.asm @@ -0,0 +1,5071 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + + +section .rdata rdata align=8 +ALIGN 64 +$L$poly: + DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 + +$L$One: + DD 1,1,1,1,1,1,1,1 +$L$Two: + DD 2,2,2,2,2,2,2,2 +$L$Three: + DD 3,3,3,3,3,3,3,3 +$L$ONE_mont: + DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe + + +$L$ord: + DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +$L$ordK: + DQ 0xccd1c8aaee00bc4f +section .text + + + + +global ecp_nistz256_neg + +ALIGN 32 +ecp_nistz256_neg: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_neg: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push r12 + + push r13 + +$L$neg_body: + + xor r8,r8 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor r13,r13 + + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov rax,r8 + sbb r11,QWORD[24+rsi] + lea rsi,[$L$poly] + mov rdx,r9 + sbb r13,0 + + add r8,QWORD[rsi] + mov rcx,r10 + adc r9,QWORD[8+rsi] + adc r10,QWORD[16+rsi] + mov r12,r11 + adc r11,QWORD[24+rsi] + test r13,r13 + + cmovz r8,rax + cmovz r9,rdx + mov QWORD[rdi],r8 + cmovz r10,rcx + mov QWORD[8+rdi],r9 + cmovz r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$neg_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_neg: + + + + + + +global ecp_nistz256_ord_mul_mont_nohw + +ALIGN 32 +ecp_nistz256_ord_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mul_body: + + mov rax,QWORD[rdx] + mov rbx,rdx + lea r14,[$L$ord] + mov r15,QWORD[$L$ordK] + + + mov rcx,rax + mul QWORD[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD[8+rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov r10,rdx + + mul QWORD[16+rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + + mov r13,r8 + imul r8,r15 + + mov r11,rdx + mul QWORD[24+rsi] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r12,rdx + + + mul QWORD[r14] + mov rbp,r8 + add r13,rax + mov rax,r8 + adc rdx,0 + mov rcx,rdx + + sub r10,r8 + sbb r8,0 + + mul QWORD[8+r14] + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,rbp + adc r10,rdx + mov rdx,rbp + adc r8,0 + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[8+rbx] + sbb rbp,rdx + + add r11,r8 + adc r12,rbp + adc r13,0 + + + mov rcx,rax + mul QWORD[rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r10,rbp + adc rdx,0 + add 
r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r9 + imul r9,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r12,rbp + adc rdx,0 + xor r8,r8 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + mul QWORD[r14] + mov rbp,r9 + add rcx,rax + mov rax,r9 + adc rcx,rdx + + sub r11,r9 + sbb r9,0 + + mul QWORD[8+r14] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc r11,rdx + mov rdx,rbp + adc r9,0 + + shl rax,32 + shr rdx,32 + sub r12,rax + mov rax,QWORD[16+rbx] + sbb rbp,rdx + + add r12,r9 + adc r13,rbp + adc r8,0 + + + mov rcx,rax + mul QWORD[rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r10 + imul r10,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r13,rbp + adc rdx,0 + xor r9,r9 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + mul QWORD[r14] + mov rbp,r10 + add rcx,rax + mov rax,r10 + adc rcx,rdx + + sub r12,r10 + sbb r10,0 + + mul QWORD[8+r14] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc r12,rdx + mov rdx,rbp + adc r10,0 + + shl rax,32 + shr rdx,32 + sub r13,rax + mov rax,QWORD[24+rbx] + sbb rbp,rdx + + add r13,r10 + adc r8,rbp + adc r9,0 + + + mov rcx,rax + mul QWORD[rsi] + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r11 + imul r11,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r8,rbp + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + mul QWORD[r14] + mov rbp,r11 + add rcx,rax + mov rax,r11 + adc rcx,rdx + + sub r13,r11 + sbb r11,0 + + mul QWORD[8+r14] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc r13,rdx + mov rdx,rbp + adc r11,0 + + shl rax,32 + shr rdx,32 + sub r8,rax + sbb rbp,rdx + + add r8,r11 + adc r9,rbp + adc r10,0 + + + mov rsi,r12 + sub r12,QWORD[r14] + mov r11,r13 + sbb r13,QWORD[8+r14] + mov rcx,r8 + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rsi + cmovc r13,r11 + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_mul_mont_nohw: + + + + + + + +global ecp_nistz256_ord_sqr_mont_nohw + +ALIGN 32 +ecp_nistz256_ord_sqr_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqr_body: + + mov r8,QWORD[rsi] + mov rax,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + lea rsi,[$L$ord] + mov rbx,rdx + jmp NEAR $L$oop_ord_sqr + +ALIGN 32 +$L$oop_ord_sqr: + + mov rbp,rax + mul r8 + mov r9,rax +DB 102,72,15,110,205 + mov rax,r14 + mov r10,rdx + + mul r8 + add r10,rax + mov rax,r15 +DB 102,73,15,110,214 + adc rdx,0 + mov r11,rdx + + mul r8 + add r11,rax + mov rax,r15 +DB 
102,73,15,110,223 + adc rdx,0 + mov r12,rdx + + + mul r14 + mov r13,rax + mov rax,r14 + mov r14,rdx + + + mul rbp + add r11,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + + mul rbp + add r12,rax + adc rdx,0 + + add r12,r15 + adc r13,rdx + adc r14,0 + + + xor r15,r15 + mov rax,r8 + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + + mul rax + mov r8,rax +DB 102,72,15,126,200 + mov rbp,rdx + + mul rax + add r9,rbp + adc r10,rax +DB 102,72,15,126,208 + adc rdx,0 + mov rbp,rdx + + mul rax + add r11,rbp + adc r12,rax +DB 102,72,15,126,216 + adc rdx,0 + mov rbp,rdx + + mov rcx,r8 + imul r8,QWORD[32+rsi] + + mul rax + add r13,rbp + adc r14,rax + mov rax,QWORD[rsi] + adc r15,rdx + + + mul r8 + mov rbp,r8 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r10,r8 + sbb rbp,0 + + mul r8 + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,r8 + adc r10,rdx + mov rdx,r8 + adc rbp,0 + + mov rcx,r9 + imul r9,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[rsi] + sbb r8,rdx + + add r11,rbp + adc r8,0 + + + mul r9 + mov rbp,r9 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r11,r9 + sbb rbp,0 + + mul r9 + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,r9 + adc r11,rdx + mov rdx,r9 + adc rbp,0 + + mov rcx,r10 + imul r10,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r8,rax + mov rax,QWORD[rsi] + sbb r9,rdx + + add r8,rbp + adc r9,0 + + + mul r10 + mov rbp,r10 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r8,r10 + sbb rbp,0 + + mul r10 + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,r10 + adc r8,rdx + mov rdx,r10 + adc rbp,0 + + mov rcx,r11 + imul r11,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r9,rax + mov rax,QWORD[rsi] + sbb r10,rdx + + add r9,rbp + adc r10,0 + + + mul r11 + mov rbp,r11 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r9,r11 + sbb rbp,0 + + mul r11 + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + mov rdx,r11 + adc rbp,0 + + shl rax,32 + shr rdx,32 + sub r10,rax + sbb r11,rdx + + add r10,rbp + adc r11,0 + + + xor rdx,rdx + add r8,r12 + adc r9,r13 + mov r12,r8 + adc r10,r14 + adc r11,r15 + mov rax,r9 + adc rdx,0 + + + sub r8,QWORD[rsi] + mov r14,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r15,r11 + sbb r11,QWORD[24+rsi] + sbb rdx,0 + + cmovc r8,r12 + cmovnc rax,r9 + cmovnc r14,r10 + cmovnc r15,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqr + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],rax + pxor xmm1,xmm1 + mov QWORD[16+rdi],r14 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r15 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont_nohw: + +global ecp_nistz256_ord_mul_mont_adx + +ALIGN 32 +ecp_nistz256_ord_mul_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mulx_body: + + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + lea r14,[(($L$ord-128))] + mov r15,QWORD[$L$ordK] + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mulx r11,rbp,r11 + add r9,rcx + mulx 
r12,rcx,r12 + mov rdx,r8 + mulx rax,rdx,r15 + adc r10,rbp + adc r11,rcx + adc r12,0 + + + xor r13,r13 + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r8,rcx + adox r9,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[8+rbx] + adcx r11,rcx + adox r12,rbp + adcx r12,r8 + adox r13,r8 + adc r13,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + mulx rax,rdx,r15 + adcx r12,rcx + adox r13,rbp + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[16+rbx] + adcx r12,rcx + adox r13,rbp + adcx r13,r9 + adox r8,r9 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + mulx rax,rdx,r15 + adcx r13,rcx + adox r8,rbp + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[24+rbx] + adcx r13,rcx + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + mulx rax,rdx,r15 + adcx r8,rcx + adox r9,rbp + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + lea r14,[128+r14] + mov rbx,r12 + adcx r8,rcx + adox r9,rbp + mov rdx,r13 + adcx r9,r11 + adox r10,r11 + adc r10,0 + + + + mov rcx,r8 + sub r12,QWORD[r14] + sbb r13,QWORD[8+r14] + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_mul_mont_adx: + +global ecp_nistz256_ord_sqr_mont_adx + +ALIGN 32 +ecp_nistz256_ord_sqr_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR +$L$ecp_nistz256_ord_sqr_mont_adx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqrx_body: + + mov rbx,rdx + mov rdx,QWORD[rsi] + 
mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[$L$ord] + jmp NEAR $L$oop_ord_sqrx + +ALIGN 32 +$L$oop_ord_sqrx: + mulx r10,r9,r14 + mulx r11,rcx,r15 + mov rax,rdx +DB 102,73,15,110,206 + mulx r12,rbp,r8 + mov rdx,r14 + add r10,rcx +DB 102,73,15,110,215 + adc r11,rbp + adc r12,0 + xor r13,r13 + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + mulx r14,rcx,r8 + mov rdx,rax +DB 102,73,15,110,216 + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + + mulx rbp,r8,rdx +DB 102,72,15,126,202 + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx +DB 102,72,15,126,210 + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + mulx rbp,rcx,rdx + DB 0x67 +DB 102,72,15,126,218 + adox r11,rax + adcx r15,r15 + adox r12,rcx + adox r13,rbp + mulx rax,rcx,rdx + adox r14,rcx + adox r15,rax + + + mov rdx,r8 + mulx rcx,rdx,QWORD[32+rsi] + + xor rax,rax + mulx rbp,rcx,QWORD[rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r9,rcx + adox r10,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r11,rcx + adox r8,rbp + adcx r8,rax + + + mov rdx,r9 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r10,rcx + adcx r11,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r8,rcx + adcx r9,rbp + adox r9,rax + + + mov rdx,r10 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r11,rcx + adox r8,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r9,rcx + adox r10,rbp + adcx r10,rax + + + mov rdx,r11 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r8,rcx + adcx r9,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r10,rcx + adcx r11,rbp + adox r11,rax + + + add r12,r8 + adc r9,r13 + mov rdx,r12 + adc r10,r14 + adc r11,r15 + mov r14,r9 + adc rax,0 + + + sub r12,QWORD[rsi] + mov r15,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r8,r11 + sbb r11,QWORD[24+rsi] + sbb rax,0 + + cmovnc rdx,r12 + cmovnc r14,r9 + cmovnc r15,r10 + cmovnc r8,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqrx + + mov QWORD[rdi],rdx + mov QWORD[8+rdi],r14 + pxor xmm1,xmm1 + mov QWORD[16+rdi],r15 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r8 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont_adx: + + + + + + +global ecp_nistz256_mul_mont_nohw + +ALIGN 32 +ecp_nistz256_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mul_body: + mov rbx,rdx + mov rax,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + + call __ecp_nistz256_mul_montq + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov 
rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_mul_mont_nohw: + + +ALIGN 32 +__ecp_nistz256_mul_montq: + + + + mov rbp,rax + mul r9 + mov r14,QWORD[(($L$poly+8))] + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r10 + mov r15,QWORD[(($L$poly+24))] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r11 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r8 + adc rdx,0 + xor r13,r13 + mov r12,rdx + + + + + + + + + + + mov rbp,r8 + shl r8,32 + mul r15 + shr rbp,32 + add r9,r8 + adc r10,rbp + adc r11,rax + mov rax,QWORD[8+rbx] + adc r12,rdx + adc r13,0 + xor r8,r8 + + + + mov rbp,rax + mul QWORD[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + + mov rbp,r9 + shl r9,32 + mul r15 + shr rbp,32 + add r10,r9 + adc r11,rbp + adc r12,rax + mov rax,QWORD[16+rbx] + adc r13,rdx + adc r8,0 + xor r9,r9 + + + + mov rbp,rax + mul QWORD[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + + mov rbp,r10 + shl r10,32 + mul r15 + shr rbp,32 + add r11,r10 + adc r12,rbp + adc r13,rax + mov rax,QWORD[24+rbx] + adc r8,rdx + adc r9,0 + xor r10,r10 + + + + mov rbp,rax + mul QWORD[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + + mov rbp,r11 + shl r11,32 + mul r15 + shr rbp,32 + add r12,r11 + adc r13,rbp + mov rcx,r12 + adc r8,rax + adc r9,rdx + mov rbp,r13 + adc r10,0 + + + + sub r12,-1 + mov rbx,r8 + sbb r13,r14 + sbb r8,0 + mov rdx,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rcx + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rbx + mov QWORD[8+rdi],r13 + cmovc r9,rdx + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + + + + + + + +global ecp_nistz256_sqr_mont_nohw + +ALIGN 32 +ecp_nistz256_sqr_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_sqr_mont_nohw: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqr_body: + mov rax,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + + call __ecp_nistz256_sqr_montq + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_sqr_mont_nohw: + + +ALIGN 32 +__ecp_nistz256_sqr_montq: + + mov r13,rax + mul r14 + mov r9,rax + mov rax,r15 + mov r10,rdx + + mul r13 + add r10,rax + mov rax,r8 
+ adc rdx,0 + mov r11,rdx + + mul r13 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + + mul r14 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul r14 + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + mov r13,rdx + adc r13,0 + + + mul r15 + xor r15,r15 + add r13,rax + mov rax,QWORD[rsi] + mov r14,rdx + adc r14,0 + + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + mul rax + mov r8,rax + mov rax,QWORD[8+rsi] + mov rcx,rdx + + mul rax + add r9,rcx + adc r10,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r11,rcx + adc r12,rax + mov rax,QWORD[24+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r13,rcx + adc r14,rax + mov rax,r8 + adc r15,rdx + + mov rsi,QWORD[(($L$poly+8))] + mov rbp,QWORD[(($L$poly+24))] + + + + + mov rcx,r8 + shl r8,32 + mul rbp + shr rcx,32 + add r9,r8 + adc r10,rcx + adc r11,rax + mov rax,r9 + adc rdx,0 + + + + mov rcx,r9 + shl r9,32 + mov r8,rdx + mul rbp + shr rcx,32 + add r10,r9 + adc r11,rcx + adc r8,rax + mov rax,r10 + adc rdx,0 + + + + mov rcx,r10 + shl r10,32 + mov r9,rdx + mul rbp + shr rcx,32 + add r11,r10 + adc r8,rcx + adc r9,rax + mov rax,r11 + adc rdx,0 + + + + mov rcx,r11 + shl r11,32 + mov r10,rdx + mul rbp + shr rcx,32 + add r8,r11 + adc r9,rcx + adc r10,rax + adc rdx,0 + xor r11,r11 + + + + add r12,r8 + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,rdx + mov r9,r13 + adc r11,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov rcx,r15 + sbb r15,rbp + sbb r11,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,rcx + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + ret + + +global ecp_nistz256_mul_mont_adx + +ALIGN 32 +ecp_nistz256_mul_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx_body: + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_mul_montx + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_mul_mont_adx: + + +ALIGN 32 +__ecp_nistz256_mul_montx: + + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mov r14,32 + xor r13,r13 + mulx r11,rbp,r11 + mov r15,QWORD[(($L$poly+24))] + adc r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + adc r10,rbp + shlx rbp,r8,r14 + adc r11,rcx + shrx rcx,r8,r14 + adc r12,0 + + + + add r9,rbp + adc r10,rcx + + mulx rbp,rcx,r15 + mov rdx,QWORD[8+rbx] + adc r11,rcx + adc r12,rbp + adc r13,0 + xor r8,r8 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + adcx r12,rcx + shlx rcx,r9,r14 + adox r13,rbp + shrx rbp,r9,r14 + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + + add r10,rcx + adc r11,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[16+rbx] + adc r12,rcx + adc r13,rbp + adc r8,0 + xor r9,r9 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx 
r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + adcx r13,rcx + shlx rcx,r10,r14 + adox r8,rbp + shrx rbp,r10,r14 + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + + add r11,rcx + adc r12,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[24+rbx] + adc r13,rcx + adc r8,rbp + adc r9,0 + xor r10,r10 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + adcx r8,rcx + shlx rcx,r11,r14 + adox r9,rbp + shrx rbp,r11,r14 + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + + add r12,rcx + adc r13,rbp + + mulx rbp,rcx,r15 + mov rbx,r12 + mov r14,QWORD[(($L$poly+8))] + adc r8,rcx + mov rdx,r13 + adc r9,rbp + adc r10,0 + + + + xor eax,eax + mov rcx,r8 + sbb r12,-1 + sbb r13,r14 + sbb r8,0 + mov rbp,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,rbp + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + +global ecp_nistz256_sqr_mont_adx + +ALIGN 32 +ecp_nistz256_sqr_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_sqr_mont_adx: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqrx_body: + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_sqr_montx + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_sqr_mont_adx: + + +ALIGN 32 +__ecp_nistz256_sqr_montx: + + mulx r10,r9,r14 + mulx r11,rcx,r15 + xor eax,eax + adc r10,rcx + mulx r12,rbp,r8 + mov rdx,r14 + adc r11,rbp + adc r12,0 + xor r13,r13 + + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + + mulx r14,rcx,r8 + mov rdx,QWORD[((0+128))+rsi] + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + mulx rbp,r8,rdx + mov rdx,QWORD[((8+128))+rsi] + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx + mov rdx,QWORD[((16+128))+rsi] + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + DB 0x67 + mulx rbp,rcx,rdx + mov rdx,QWORD[((24+128))+rsi] + adox r11,rax + adcx r15,r15 + adox r12,rcx + mov rsi,32 + adox r13,rbp + DB 0x67,0x67 + mulx rax,rcx,rdx + mov rdx,QWORD[(($L$poly+24))] + adox r14,rcx + shlx rcx,r8,rsi + adox r15,rax + shrx rax,r8,rsi + mov rbp,rdx + + + add r9,rcx + adc r10,rax + + mulx r8,rcx,r8 + adc r11,rcx + shlx rcx,r9,rsi + adc r8,0 + shrx rax,r9,rsi + + + add r10,rcx + adc r11,rax + + mulx r9,rcx,r9 + adc r8,rcx + shlx rcx,r10,rsi + adc r9,0 + shrx rax,r10,rsi + + + add r11,rcx + adc r8,rax + + mulx r10,rcx,r10 + adc r9,rcx + shlx rcx,r11,rsi + adc r10,0 + shrx rax,r11,rsi + + + add r8,rcx + adc r9,rax + + mulx r11,rcx,r11 + adc r10,rcx + adc r11,0 + + xor rdx,rdx + add r12,r8 + mov rsi,QWORD[(($L$poly+8))] + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,r11 + mov r9,r13 + adc rdx,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov r11,r15 + sbb r15,rbp + sbb rdx,0 + + cmovc r12,r8 + cmovc r13,r9 + 
mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,r11 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + ret + + + + +global ecp_nistz256_select_w5_nohw + +ALIGN 32 +ecp_nistz256_select_w5_nohw: + +_CET_ENDBR + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w5_nohw: + DB 0x48,0x8d,0x60,0xe0 + DB 0x0f,0x29,0x70,0xe0 + DB 0x0f,0x29,0x78,0xf0 + DB 0x44,0x0f,0x29,0x00 + DB 0x44,0x0f,0x29,0x48,0x10 + DB 0x44,0x0f,0x29,0x50,0x20 + DB 0x44,0x0f,0x29,0x58,0x30 + DB 0x44,0x0f,0x29,0x60,0x40 + DB 0x44,0x0f,0x29,0x68,0x50 + DB 0x44,0x0f,0x29,0x70,0x60 + DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm0,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + + movdqa xmm8,xmm0 + pshufd xmm1,xmm1,0 + + mov rax,16 +$L$select_loop_sse_w5: + + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + pcmpeqd xmm15,xmm1 + + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + movdqa xmm13,XMMWORD[64+rdx] + movdqa xmm14,XMMWORD[80+rdx] + lea rdx,[96+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + pand xmm13,xmm15 + por xmm5,xmm12 + pand xmm14,xmm15 + por xmm6,xmm13 + por xmm7,xmm14 + + dec rax + jnz NEAR $L$select_loop_sse_w5 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movdqu XMMWORD[64+rcx],xmm6 + movdqu XMMWORD[80+rcx],xmm7 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_ecp_nistz256_select_w5_nohw: + + + + +global ecp_nistz256_select_w7_nohw + +ALIGN 32 +ecp_nistz256_select_w7_nohw: + +_CET_ENDBR + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w7_nohw: + DB 0x48,0x8d,0x60,0xe0 + DB 0x0f,0x29,0x70,0xe0 + DB 0x0f,0x29,0x78,0xf0 + DB 0x44,0x0f,0x29,0x00 + DB 0x44,0x0f,0x29,0x48,0x10 + DB 0x44,0x0f,0x29,0x50,0x20 + DB 0x44,0x0f,0x29,0x58,0x30 + DB 0x44,0x0f,0x29,0x60,0x40 + DB 0x44,0x0f,0x29,0x68,0x50 + DB 0x44,0x0f,0x29,0x70,0x60 + DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm8,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + movdqa xmm0,xmm8 + pshufd xmm1,xmm1,0 + mov rax,64 + +$L$select_loop_sse_w7: + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + pcmpeqd xmm15,xmm1 + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + lea rdx,[64+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + prefetcht0 [255+rdx] + por xmm5,xmm12 + + dec rax + jnz NEAR $L$select_loop_sse_w7 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_ecp_nistz256_select_w7_nohw: + + + +global ecp_nistz256_select_w5_avx2 + +ALIGN 32 
+ecp_nistz256_select_w5_avx2: + +_CET_ENDBR + vzeroupper + lea rax,[((-136))+rsp] + mov r11,rsp +$L$SEH_begin_ecp_nistz256_select_w5_avx2: + DB 0x48,0x8d,0x60,0xe0 + DB 0xc5,0xf8,0x29,0x70,0xe0 + DB 0xc5,0xf8,0x29,0x78,0xf0 + DB 0xc5,0x78,0x29,0x40,0x00 + DB 0xc5,0x78,0x29,0x48,0x10 + DB 0xc5,0x78,0x29,0x50,0x20 + DB 0xc5,0x78,0x29,0x58,0x30 + DB 0xc5,0x78,0x29,0x60,0x40 + DB 0xc5,0x78,0x29,0x68,0x50 + DB 0xc5,0x78,0x29,0x70,0x60 + DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Two] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + vpxor ymm4,ymm4,ymm4 + + vmovdqa ymm5,YMMWORD[$L$One] + vmovdqa ymm10,YMMWORD[$L$Two] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 + + mov rax,8 +$L$select_loop_avx2_w5: + + vmovdqa ymm6,YMMWORD[rdx] + vmovdqa ymm7,YMMWORD[32+rdx] + vmovdqa ymm8,YMMWORD[64+rdx] + + vmovdqa ymm11,YMMWORD[96+rdx] + vmovdqa ymm12,YMMWORD[128+rdx] + vmovdqa ymm13,YMMWORD[160+rdx] + + vpcmpeqd ymm9,ymm5,ymm1 + vpcmpeqd ymm14,ymm10,ymm1 + + vpaddd ymm5,ymm5,ymm0 + vpaddd ymm10,ymm10,ymm0 + lea rdx,[192+rdx] + + vpand ymm6,ymm6,ymm9 + vpand ymm7,ymm7,ymm9 + vpand ymm8,ymm8,ymm9 + vpand ymm11,ymm11,ymm14 + vpand ymm12,ymm12,ymm14 + vpand ymm13,ymm13,ymm14 + + vpxor ymm2,ymm2,ymm6 + vpxor ymm3,ymm3,ymm7 + vpxor ymm4,ymm4,ymm8 + vpxor ymm2,ymm2,ymm11 + vpxor ymm3,ymm3,ymm12 + vpxor ymm4,ymm4,ymm13 + + dec rax + jnz NEAR $L$select_loop_avx2_w5 + + vmovdqu YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[r11] + ret + +$L$SEH_end_ecp_nistz256_select_w5_avx2: + + + + +global ecp_nistz256_select_w7_avx2 + +ALIGN 32 +ecp_nistz256_select_w7_avx2: + +_CET_ENDBR + vzeroupper + mov r11,rsp + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w7_avx2: + DB 0x48,0x8d,0x60,0xe0 + DB 0xc5,0xf8,0x29,0x70,0xe0 + DB 0xc5,0xf8,0x29,0x78,0xf0 + DB 0xc5,0x78,0x29,0x40,0x00 + DB 0xc5,0x78,0x29,0x48,0x10 + DB 0xc5,0x78,0x29,0x50,0x20 + DB 0xc5,0x78,0x29,0x58,0x30 + DB 0xc5,0x78,0x29,0x60,0x40 + DB 0xc5,0x78,0x29,0x68,0x50 + DB 0xc5,0x78,0x29,0x70,0x60 + DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Three] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + + vmovdqa ymm4,YMMWORD[$L$One] + vmovdqa ymm8,YMMWORD[$L$Two] + vmovdqa ymm12,YMMWORD[$L$Three] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 + + + mov rax,21 +$L$select_loop_avx2_w7: + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vmovdqa ymm9,YMMWORD[64+rdx] + vmovdqa ymm10,YMMWORD[96+rdx] + + vmovdqa ymm13,YMMWORD[128+rdx] + vmovdqa ymm14,YMMWORD[160+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + vpcmpeqd ymm11,ymm8,ymm1 + vpcmpeqd ymm15,ymm12,ymm1 + + vpaddd ymm4,ymm4,ymm0 + vpaddd ymm8,ymm8,ymm0 + vpaddd ymm12,ymm12,ymm0 + lea rdx,[192+rdx] + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + vpand ymm9,ymm9,ymm11 + vpand ymm10,ymm10,ymm11 + vpand ymm13,ymm13,ymm15 + vpand ymm14,ymm14,ymm15 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + vpxor ymm2,ymm2,ymm9 + vpxor ymm3,ymm3,ymm10 + vpxor ymm2,ymm2,ymm13 + vpxor ymm3,ymm3,ymm14 + + dec rax + jnz NEAR $L$select_loop_avx2_w7 + + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + + vmovdqu 
YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[r11] + ret + +$L$SEH_end_ecp_nistz256_select_w7_avx2: + + +ALIGN 32 +__ecp_nistz256_add_toq: + + xor r11,r11 + add r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromq: + + sub r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,r11 + + add r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_subq: + + sub rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,r11 + + add rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + test r11,r11 + + cmovnz r12,rax + cmovnz r13,rbp + cmovnz r8,rcx + cmovnz r9,r10 + + ret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2q: + + xor r11,r11 + add r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + +global ecp_nistz256_point_double_nohw + +ALIGN 32 +ecp_nistz256_point_double_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_double_nohw: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doubleq_body: + +$L$point_double_shortcutq: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-0))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-0))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call 
__ecp_nistz256_mul_by_2q + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rax,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subq + + mov rax,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-0))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doubleq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_double_nohw: +global ecp_nistz256_point_add_nohw + +ALIGN 32 +ecp_nistz256_point_add_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addq_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + 
movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 +DB 102,72,15,110,199 + + lea rsi,[((64-0))+rsi] + mov QWORD[((544+0))+rsp],rax + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rax,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-0))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((0+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 102,73,15,126,208 +DB 102,73,15,126,217 + or r12,r8 + DB 0x3e + jnz NEAR $L$add_proceedq + + + + test r9,r9 + jz NEAR $L$add_doubleq + + + + + + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu 
XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_doneq + +ALIGN 32 +$L$add_doubleq: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + + jmp NEAR $L$point_double_shortcutq + + +ALIGN 32 +$L$add_proceedq: + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((0+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + 
movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_doneq: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_nohw: +global ecp_nistz256_point_add_affine_nohw + +ALIGN 32 +ecp_nistz256_point_add_affine_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affine_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affineq_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-0))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rax,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-0))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + 
lea rdi,[288+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((0+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((0+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por 
xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affineq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_affine_nohw: + +ALIGN 32 +__ecp_nistz256_add_tox: + + xor r11,r11 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromx: + + xor r11,r11 + sbb r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,0 + + xor r10,r10 + adc r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + + bt r11,0 + cmovnc r12,rax + cmovnc r13,rbp + mov QWORD[rdi],r12 + cmovnc r8,rcx + mov QWORD[8+rdi],r13 + cmovnc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_subx: + + xor r11,r11 + sbb rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,0 + + xor r9,r9 + adc rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + + bt r11,0 + cmovc r12,rax + cmovc r13,rbp + cmovc r8,rcx + cmovc r9,r10 + + ret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2x: + + xor r11,r11 + adc r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + +global ecp_nistz256_point_double_adx + +ALIGN 32 +ecp_nistz256_point_double_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_double_adx: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doublex_body: + +$L$point_double_shortcutx: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((64+0))+rsi] + mov 
r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-128))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-128))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rdx,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subx + + mov rdx,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-128))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doublex_epilogue: + mov 
rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_double_adx: +global ecp_nistz256_point_add_adx + +ALIGN 32 +ecp_nistz256_point_add_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addx_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 +DB 102,72,15,110,199 + + lea rsi,[((64-128))+rsi] + mov QWORD[((544+0))+rsp],rdx + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rdx,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-128))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((-128+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[480+rsp] + lea rbx,[480+rsp] + mov 
r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 102,73,15,126,208 +DB 102,73,15,126,217 + or r12,r8 + DB 0x3e + jnz NEAR $L$add_proceedx + + + + test r9,r9 + jz NEAR $L$add_doublex + + + + + + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_donex + +ALIGN 32 +$L$add_doublex: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + + jmp NEAR $L$point_double_shortcutx + + +ALIGN 32 +$L$add_proceedx: + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((-128+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call 
__ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_donex: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_adx: +global ecp_nistz256_point_add_affine_adx + +ALIGN 32 +ecp_nistz256_point_add_affine_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affine_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affinex_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-128))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rdx,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + 
por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-128))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((-128+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((-128+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 
+ + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affinex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_affine_adx: +EXTERN __imp_RtlVirtualUnwind + + +ALIGN 16 +short_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[16+rax] + + mov r12,QWORD[((-8))+rax] + mov r13,QWORD[((-16))+rax] + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea rax,[r10*1+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov 
QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_neg wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_mul_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_select_w5_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w5_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w7_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w7_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_select_w5_avx2 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w5_avx2 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_avx2 wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w7_avx2 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w7_avx2 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_avx2 wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_adx 
wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affine_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine_adx wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ecp_nistz256_neg: + DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_ord_mul_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_mul_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mulx_body wrt ..imagebase,$L$mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqrx_body wrt ..imagebase,$L$sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_select_wX_nohw: + DB 0x01,0x33,0x16,0x00 + DB 0x33,0xf8,0x09,0x00 + DB 0x2e,0xe8,0x08,0x00 + DB 0x29,0xd8,0x07,0x00 + DB 0x24,0xc8,0x06,0x00 + DB 0x1f,0xb8,0x05,0x00 + DB 0x1a,0xa8,0x04,0x00 + DB 0x15,0x98,0x03,0x00 + DB 0x10,0x88,0x02,0x00 + DB 0x0c,0x78,0x01,0x00 + DB 0x08,0x68,0x00,0x00 + DB 0x04,0x01,0x15,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_select_wX_avx2: + DB 0x01,0x36,0x17,0x0b + DB 0x36,0xf8,0x09,0x00 + DB 0x31,0xe8,0x08,0x00 + DB 0x2c,0xd8,0x07,0x00 + DB 0x27,0xc8,0x06,0x00 + DB 0x22,0xb8,0x05,0x00 + DB 0x1d,0xa8,0x04,0x00 + DB 0x18,0x98,0x03,0x00 + DB 0x13,0x88,0x02,0x00 + DB 0x0e,0x78,0x01,0x00 + DB 0x09,0x68,0x00,0x00 + DB 0x04,0x01,0x15,0x00 + DB 0x00,0xb3,0x00,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase + DD 32*15+56,0 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD 
$L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase + DD 32*15+56,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.o b/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.o new file mode 100644 index 0000000000..7c7647eb5e Binary files /dev/null and b/ring-0.17.14/pregenerated/p256-x86_64-asm-nasm.o differ diff --git a/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols.h b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols.h new file mode 100644 index 0000000000..da4b6fad47 --- /dev/null +++ b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols.h @@ -0,0 +1,168 @@ + +#ifndef ring_core_generated_PREFIX_SYMBOLS_H +#define ring_core_generated_PREFIX_SYMBOLS_H + +#define ecp_nistz256_point_double p256_point_double +#define ecp_nistz256_point_add p256_point_add +#define ecp_nistz256_point_add_affine p256_point_add_affine +#define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont +#define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont +#define ecp_nistz256_mul_mont p256_mul_mont +#define ecp_nistz256_sqr_mont p256_sqr_mont +#define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available +#define avx2_available ring_core_0_17_14__avx2_available +#define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp +#define CRYPTO_poly1305_finish ring_core_0_17_14__CRYPTO_poly1305_finish +#define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon +#define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init +#define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon +#define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update +#define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon +#define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 +#define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 +#define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon +#define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw +#define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 +#define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +#define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero +#define LIMBS_add_mod ring_core_0_17_14__LIMBS_add_mod +#define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero +#define LIMBS_equal ring_core_0_17_14__LIMBS_equal +#define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than +#define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once +#define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 +#define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod +#define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod +#define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window +#define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window +#define LIMB_shr ring_core_0_17_14__LIMB_shr +#define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup +#define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel +#define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +#define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel +#define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +#define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +#define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key +#define aes_hw_set_encrypt_key_alt 
ring_core_0_17_14__aes_hw_set_encrypt_key_alt +#define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base +#define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +#define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt +#define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key +#define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt +#define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt +#define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place +#define bn_gather5 ring_core_0_17_14__bn_gather5 +#define bn_mul_mont ring_core_0_17_14__bn_mul_mont +#define bn_mul_mont_nohw ring_core_0_17_14__bn_mul_mont_nohw +#define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont +#define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont +#define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon +#define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 +#define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 +#define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 +#define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw +#define bn_powerx5 ring_core_0_17_14__bn_powerx5 +#define bn_scatter5 ring_core_0_17_14__bn_scatter5 +#define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal +#define bn_sqr8x_mont ring_core_0_17_14__bn_sqr8x_mont +#define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal +#define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +#define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +#define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +#define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main +#define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open +#define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 +#define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 +#define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal +#define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 +#define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 +#define ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx +#define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +#define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +#define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +#define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +#define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +#define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx +#define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw +#define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +#define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +#define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx +#define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw +#define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +#define ecp_nistz256_select_w5_nohw 
ring_core_0_17_14__ecp_nistz256_select_w5_nohw +#define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +#define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw +#define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +#define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +#define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul +#define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square +#define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx +#define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul +#define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon +#define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +#define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul +#define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon +#define gcm_init_avx ring_core_0_17_14__gcm_init_avx +#define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul +#define gcm_init_neon ring_core_0_17_14__gcm_init_neon +#define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +#define k25519Precomp ring_core_0_17_14__k25519Precomp +#define limbs_mul_add_limb ring_core_0_17_14__limbs_mul_add_limb +#define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar +#define ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg +#define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 +#define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 +#define neon_available ring_core_0_17_14__neon_available +#define p256_mul_mont ring_core_0_17_14__p256_mul_mont +#define p256_point_add ring_core_0_17_14__p256_point_add +#define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine +#define p256_point_double ring_core_0_17_14__p256_point_double +#define p256_point_mul ring_core_0_17_14__p256_point_mul +#define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base +#define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime +#define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont +#define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont +#define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont +#define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 +#define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont +#define p384_elem_neg ring_core_0_17_14__p384_elem_neg +#define p384_elem_sub ring_core_0_17_14__p384_elem_sub +#define p384_point_add ring_core_0_17_14__p384_point_add +#define p384_point_double ring_core_0_17_14__p384_point_double +#define p384_point_mul ring_core_0_17_14__p384_point_mul +#define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont +#define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +#define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks +#define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order +#define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx +#define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 +#define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw +#define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon +#define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw +#define sha512_block_data_order 
ring_core_0_17_14__sha512_block_data_order +#define sha512_block_data_order_avx ring_core_0_17_14__sha512_block_data_order_avx +#define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw +#define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon +#define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw +#define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +#define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt +#define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +#define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key +#define x25519_NEON ring_core_0_17_14__x25519_NEON +#define x25519_fe_invert ring_core_0_17_14__x25519_fe_invert +#define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative +#define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt +#define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg +#define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes +#define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +#define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime +#define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base +#define x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx +#define x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked +#define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask +#define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd +#define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce +#define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx +#define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked + +#endif diff --git a/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_asm.h b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_asm.h new file mode 100644 index 0000000000..5db19eb659 --- /dev/null +++ b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_asm.h @@ -0,0 +1,334 @@ + +#ifndef ring_core_generated_PREFIX_SYMBOLS_ASM_H +#define ring_core_generated_PREFIX_SYMBOLS_ASM_H + +#if defined(__APPLE__) +#define _ecp_nistz256_point_double _p256_point_double +#define _ecp_nistz256_point_add _p256_point_add +#define _ecp_nistz256_point_add_affine _p256_point_add_affine +#define _ecp_nistz256_ord_mul_mont _p256_scalar_mul_mont +#define _ecp_nistz256_ord_sqr_mont _p256_scalar_sqr_rep_mont +#define _ecp_nistz256_mul_mont _p256_mul_mont +#define _ecp_nistz256_sqr_mont _p256_sqr_mont +#define _adx_bmi2_available _ring_core_0_17_14__adx_bmi2_available +#define _avx2_available _ring_core_0_17_14__avx2_available +#define _CRYPTO_memcmp _ring_core_0_17_14__CRYPTO_memcmp +#define _CRYPTO_poly1305_finish _ring_core_0_17_14__CRYPTO_poly1305_finish +#define _CRYPTO_poly1305_finish_neon _ring_core_0_17_14__CRYPTO_poly1305_finish_neon +#define _CRYPTO_poly1305_init _ring_core_0_17_14__CRYPTO_poly1305_init +#define _CRYPTO_poly1305_init_neon _ring_core_0_17_14__CRYPTO_poly1305_init_neon +#define _CRYPTO_poly1305_update _ring_core_0_17_14__CRYPTO_poly1305_update +#define _CRYPTO_poly1305_update_neon _ring_core_0_17_14__CRYPTO_poly1305_update_neon +#define _ChaCha20_ctr32 _ring_core_0_17_14__ChaCha20_ctr32 +#define _ChaCha20_ctr32_avx2 _ring_core_0_17_14__ChaCha20_ctr32_avx2 +#define _ChaCha20_ctr32_neon _ring_core_0_17_14__ChaCha20_ctr32_neon +#define 
_ChaCha20_ctr32_nohw _ring_core_0_17_14__ChaCha20_ctr32_nohw +#define _ChaCha20_ctr32_ssse3 _ring_core_0_17_14__ChaCha20_ctr32_ssse3 +#define _ChaCha20_ctr32_ssse3_4x _ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +#define _LIMB_is_zero _ring_core_0_17_14__LIMB_is_zero +#define _LIMBS_add_mod _ring_core_0_17_14__LIMBS_add_mod +#define _LIMBS_are_zero _ring_core_0_17_14__LIMBS_are_zero +#define _LIMBS_equal _ring_core_0_17_14__LIMBS_equal +#define _LIMBS_less_than _ring_core_0_17_14__LIMBS_less_than +#define _LIMBS_reduce_once _ring_core_0_17_14__LIMBS_reduce_once +#define _LIMBS_select_512_32 _ring_core_0_17_14__LIMBS_select_512_32 +#define _LIMBS_shl_mod _ring_core_0_17_14__LIMBS_shl_mod +#define _LIMBS_sub_mod _ring_core_0_17_14__LIMBS_sub_mod +#define _LIMBS_window5_split_window _ring_core_0_17_14__LIMBS_window5_split_window +#define _LIMBS_window5_unsplit_window _ring_core_0_17_14__LIMBS_window5_unsplit_window +#define _LIMB_shr _ring_core_0_17_14__LIMB_shr +#define _OPENSSL_cpuid_setup _ring_core_0_17_14__OPENSSL_cpuid_setup +#define _aes_gcm_dec_kernel _ring_core_0_17_14__aes_gcm_dec_kernel +#define _aes_gcm_dec_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +#define _aes_gcm_enc_kernel _ring_core_0_17_14__aes_gcm_enc_kernel +#define _aes_gcm_enc_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +#define _aes_hw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +#define _aes_hw_set_encrypt_key _ring_core_0_17_14__aes_hw_set_encrypt_key +#define _aes_hw_set_encrypt_key_alt _ring_core_0_17_14__aes_hw_set_encrypt_key_alt +#define _aes_hw_set_encrypt_key_base _ring_core_0_17_14__aes_hw_set_encrypt_key_base +#define _aes_nohw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +#define _aes_nohw_encrypt _ring_core_0_17_14__aes_nohw_encrypt +#define _aes_nohw_set_encrypt_key _ring_core_0_17_14__aes_nohw_set_encrypt_key +#define _aesni_gcm_decrypt _ring_core_0_17_14__aesni_gcm_decrypt +#define _aesni_gcm_encrypt _ring_core_0_17_14__aesni_gcm_encrypt +#define _bn_from_montgomery_in_place _ring_core_0_17_14__bn_from_montgomery_in_place +#define _bn_gather5 _ring_core_0_17_14__bn_gather5 +#define _bn_mul_mont _ring_core_0_17_14__bn_mul_mont +#define _bn_mul_mont_nohw _ring_core_0_17_14__bn_mul_mont_nohw +#define _bn_mul4x_mont _ring_core_0_17_14__bn_mul4x_mont +#define _bn_mulx4x_mont _ring_core_0_17_14__bn_mulx4x_mont +#define _bn_mul8x_mont_neon _ring_core_0_17_14__bn_mul8x_mont_neon +#define _bn_mul4x_mont_gather5 _ring_core_0_17_14__bn_mul4x_mont_gather5 +#define _bn_mulx4x_mont_gather5 _ring_core_0_17_14__bn_mulx4x_mont_gather5 +#define _bn_neg_inv_mod_r_u64 _ring_core_0_17_14__bn_neg_inv_mod_r_u64 +#define _bn_power5_nohw _ring_core_0_17_14__bn_power5_nohw +#define _bn_powerx5 _ring_core_0_17_14__bn_powerx5 +#define _bn_scatter5 _ring_core_0_17_14__bn_scatter5 +#define _bn_sqr8x_internal _ring_core_0_17_14__bn_sqr8x_internal +#define _bn_sqr8x_mont _ring_core_0_17_14__bn_sqr8x_mont +#define _bn_sqrx8x_internal _ring_core_0_17_14__bn_sqrx8x_internal +#define _bsaes_ctr32_encrypt_blocks _ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +#define _bssl_constant_time_test_conditional_memcpy _ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +#define _bssl_constant_time_test_conditional_memxor _ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +#define _bssl_constant_time_test_main _ring_core_0_17_14__bssl_constant_time_test_main +#define _chacha20_poly1305_open _ring_core_0_17_14__chacha20_poly1305_open 
+#define _chacha20_poly1305_open_avx2 _ring_core_0_17_14__chacha20_poly1305_open_avx2 +#define _chacha20_poly1305_open_sse41 _ring_core_0_17_14__chacha20_poly1305_open_sse41 +#define _chacha20_poly1305_seal _ring_core_0_17_14__chacha20_poly1305_seal +#define _chacha20_poly1305_seal_avx2 _ring_core_0_17_14__chacha20_poly1305_seal_avx2 +#define _chacha20_poly1305_seal_sse41 _ring_core_0_17_14__chacha20_poly1305_seal_sse41 +#define _ecp_nistz256_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_mul_mont_adx +#define _ecp_nistz256_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +#define _ecp_nistz256_ord_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +#define _ecp_nistz256_ord_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +#define _ecp_nistz256_ord_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +#define _ecp_nistz256_ord_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +#define _ecp_nistz256_point_add_adx _ring_core_0_17_14__ecp_nistz256_point_add_adx +#define _ecp_nistz256_point_add_nohw _ring_core_0_17_14__ecp_nistz256_point_add_nohw +#define _ecp_nistz256_point_add_affine_adx _ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +#define _ecp_nistz256_point_add_affine_nohw _ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +#define _ecp_nistz256_point_double_adx _ring_core_0_17_14__ecp_nistz256_point_double_adx +#define _ecp_nistz256_point_double_nohw _ring_core_0_17_14__ecp_nistz256_point_double_nohw +#define _ecp_nistz256_select_w5_avx2 _ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +#define _ecp_nistz256_select_w5_nohw _ring_core_0_17_14__ecp_nistz256_select_w5_nohw +#define _ecp_nistz256_select_w7_avx2 _ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +#define _ecp_nistz256_select_w7_nohw _ring_core_0_17_14__ecp_nistz256_select_w7_nohw +#define _ecp_nistz256_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +#define _ecp_nistz256_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +#define _fiat_curve25519_adx_mul _ring_core_0_17_14__fiat_curve25519_adx_mul +#define _fiat_curve25519_adx_square _ring_core_0_17_14__fiat_curve25519_adx_square +#define _gcm_ghash_avx _ring_core_0_17_14__gcm_ghash_avx +#define _gcm_ghash_clmul _ring_core_0_17_14__gcm_ghash_clmul +#define _gcm_ghash_neon _ring_core_0_17_14__gcm_ghash_neon +#define _gcm_ghash_vpclmulqdq_avx2_1 _ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +#define _gcm_gmult_clmul _ring_core_0_17_14__gcm_gmult_clmul +#define _gcm_gmult_neon _ring_core_0_17_14__gcm_gmult_neon +#define _gcm_init_avx _ring_core_0_17_14__gcm_init_avx +#define _gcm_init_clmul _ring_core_0_17_14__gcm_init_clmul +#define _gcm_init_neon _ring_core_0_17_14__gcm_init_neon +#define _gcm_init_vpclmulqdq_avx2 _ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +#define _k25519Precomp _ring_core_0_17_14__k25519Precomp +#define _limbs_mul_add_limb _ring_core_0_17_14__limbs_mul_add_limb +#define _little_endian_bytes_from_scalar _ring_core_0_17_14__little_endian_bytes_from_scalar +#define _ecp_nistz256_neg _ring_core_0_17_14__ecp_nistz256_neg +#define _ecp_nistz256_select_w5 _ring_core_0_17_14__ecp_nistz256_select_w5 +#define _ecp_nistz256_select_w7 _ring_core_0_17_14__ecp_nistz256_select_w7 +#define _neon_available _ring_core_0_17_14__neon_available +#define _p256_mul_mont _ring_core_0_17_14__p256_mul_mont +#define _p256_point_add _ring_core_0_17_14__p256_point_add +#define _p256_point_add_affine _ring_core_0_17_14__p256_point_add_affine +#define _p256_point_double 
_ring_core_0_17_14__p256_point_double +#define _p256_point_mul _ring_core_0_17_14__p256_point_mul +#define _p256_point_mul_base _ring_core_0_17_14__p256_point_mul_base +#define _p256_point_mul_base_vartime _ring_core_0_17_14__p256_point_mul_base_vartime +#define _p256_scalar_mul_mont _ring_core_0_17_14__p256_scalar_mul_mont +#define _p256_scalar_sqr_rep_mont _ring_core_0_17_14__p256_scalar_sqr_rep_mont +#define _p256_sqr_mont _ring_core_0_17_14__p256_sqr_mont +#define _p384_elem_div_by_2 _ring_core_0_17_14__p384_elem_div_by_2 +#define _p384_elem_mul_mont _ring_core_0_17_14__p384_elem_mul_mont +#define _p384_elem_neg _ring_core_0_17_14__p384_elem_neg +#define _p384_elem_sub _ring_core_0_17_14__p384_elem_sub +#define _p384_point_add _ring_core_0_17_14__p384_point_add +#define _p384_point_double _ring_core_0_17_14__p384_point_double +#define _p384_point_mul _ring_core_0_17_14__p384_point_mul +#define _p384_scalar_mul_mont _ring_core_0_17_14__p384_scalar_mul_mont +#define _openssl_poly1305_neon2_addmulmod _ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +#define _openssl_poly1305_neon2_blocks _ring_core_0_17_14__openssl_poly1305_neon2_blocks +#define _sha256_block_data_order _ring_core_0_17_14__sha256_block_data_order +#define _sha256_block_data_order_avx _ring_core_0_17_14__sha256_block_data_order_avx +#define _sha256_block_data_order_ssse3 _ring_core_0_17_14__sha256_block_data_order_ssse3 +#define _sha256_block_data_order_hw _ring_core_0_17_14__sha256_block_data_order_hw +#define _sha256_block_data_order_neon _ring_core_0_17_14__sha256_block_data_order_neon +#define _sha256_block_data_order_nohw _ring_core_0_17_14__sha256_block_data_order_nohw +#define _sha512_block_data_order _ring_core_0_17_14__sha512_block_data_order +#define _sha512_block_data_order_avx _ring_core_0_17_14__sha512_block_data_order_avx +#define _sha512_block_data_order_hw _ring_core_0_17_14__sha512_block_data_order_hw +#define _sha512_block_data_order_neon _ring_core_0_17_14__sha512_block_data_order_neon +#define _sha512_block_data_order_nohw _ring_core_0_17_14__sha512_block_data_order_nohw +#define _vpaes_ctr32_encrypt_blocks _ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +#define _vpaes_encrypt _ring_core_0_17_14__vpaes_encrypt +#define _vpaes_encrypt_key_to_bsaes _ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +#define _vpaes_set_encrypt_key _ring_core_0_17_14__vpaes_set_encrypt_key +#define _x25519_NEON _ring_core_0_17_14__x25519_NEON +#define _x25519_fe_invert _ring_core_0_17_14__x25519_fe_invert +#define _x25519_fe_isnegative _ring_core_0_17_14__x25519_fe_isnegative +#define _x25519_fe_mul_ttt _ring_core_0_17_14__x25519_fe_mul_ttt +#define _x25519_fe_neg _ring_core_0_17_14__x25519_fe_neg +#define _x25519_fe_tobytes _ring_core_0_17_14__x25519_fe_tobytes +#define _x25519_ge_double_scalarmult_vartime _ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +#define _x25519_ge_frombytes_vartime _ring_core_0_17_14__x25519_ge_frombytes_vartime +#define _x25519_ge_scalarmult_base _ring_core_0_17_14__x25519_ge_scalarmult_base +#define _x25519_ge_scalarmult_base_adx _ring_core_0_17_14__x25519_ge_scalarmult_base_adx +#define _x25519_public_from_private_generic_masked _ring_core_0_17_14__x25519_public_from_private_generic_masked +#define _x25519_sc_mask _ring_core_0_17_14__x25519_sc_mask +#define _x25519_sc_muladd _ring_core_0_17_14__x25519_sc_muladd +#define _x25519_sc_reduce _ring_core_0_17_14__x25519_sc_reduce +#define _x25519_scalar_mult_adx _ring_core_0_17_14__x25519_scalar_mult_adx +#define 
_x25519_scalar_mult_generic_masked _ring_core_0_17_14__x25519_scalar_mult_generic_masked + +#else +#define ecp_nistz256_point_double p256_point_double +#define ecp_nistz256_point_add p256_point_add +#define ecp_nistz256_point_add_affine p256_point_add_affine +#define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont +#define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont +#define ecp_nistz256_mul_mont p256_mul_mont +#define ecp_nistz256_sqr_mont p256_sqr_mont +#define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available +#define avx2_available ring_core_0_17_14__avx2_available +#define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp +#define CRYPTO_poly1305_finish ring_core_0_17_14__CRYPTO_poly1305_finish +#define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon +#define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init +#define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon +#define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update +#define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon +#define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 +#define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 +#define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon +#define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw +#define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 +#define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +#define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero +#define LIMBS_add_mod ring_core_0_17_14__LIMBS_add_mod +#define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero +#define LIMBS_equal ring_core_0_17_14__LIMBS_equal +#define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than +#define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once +#define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 +#define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod +#define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod +#define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window +#define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window +#define LIMB_shr ring_core_0_17_14__LIMB_shr +#define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup +#define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel +#define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +#define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel +#define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +#define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +#define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key +#define aes_hw_set_encrypt_key_alt ring_core_0_17_14__aes_hw_set_encrypt_key_alt +#define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base +#define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +#define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt +#define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key +#define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt +#define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt +#define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place +#define bn_gather5 ring_core_0_17_14__bn_gather5 +#define bn_mul_mont ring_core_0_17_14__bn_mul_mont +#define bn_mul_mont_nohw 
ring_core_0_17_14__bn_mul_mont_nohw +#define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont +#define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont +#define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon +#define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 +#define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 +#define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 +#define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw +#define bn_powerx5 ring_core_0_17_14__bn_powerx5 +#define bn_scatter5 ring_core_0_17_14__bn_scatter5 +#define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal +#define bn_sqr8x_mont ring_core_0_17_14__bn_sqr8x_mont +#define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal +#define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +#define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +#define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +#define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main +#define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open +#define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 +#define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 +#define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal +#define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 +#define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 +#define ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx +#define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +#define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +#define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +#define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +#define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +#define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx +#define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw +#define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +#define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +#define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx +#define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw +#define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +#define ecp_nistz256_select_w5_nohw ring_core_0_17_14__ecp_nistz256_select_w5_nohw +#define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +#define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw +#define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +#define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +#define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul +#define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square +#define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx +#define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul +#define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon 
+#define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +#define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul +#define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon +#define gcm_init_avx ring_core_0_17_14__gcm_init_avx +#define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul +#define gcm_init_neon ring_core_0_17_14__gcm_init_neon +#define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +#define k25519Precomp ring_core_0_17_14__k25519Precomp +#define limbs_mul_add_limb ring_core_0_17_14__limbs_mul_add_limb +#define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar +#define ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg +#define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 +#define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 +#define neon_available ring_core_0_17_14__neon_available +#define p256_mul_mont ring_core_0_17_14__p256_mul_mont +#define p256_point_add ring_core_0_17_14__p256_point_add +#define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine +#define p256_point_double ring_core_0_17_14__p256_point_double +#define p256_point_mul ring_core_0_17_14__p256_point_mul +#define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base +#define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime +#define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont +#define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont +#define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont +#define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 +#define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont +#define p384_elem_neg ring_core_0_17_14__p384_elem_neg +#define p384_elem_sub ring_core_0_17_14__p384_elem_sub +#define p384_point_add ring_core_0_17_14__p384_point_add +#define p384_point_double ring_core_0_17_14__p384_point_double +#define p384_point_mul ring_core_0_17_14__p384_point_mul +#define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont +#define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +#define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks +#define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order +#define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx +#define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 +#define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw +#define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon +#define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw +#define sha512_block_data_order ring_core_0_17_14__sha512_block_data_order +#define sha512_block_data_order_avx ring_core_0_17_14__sha512_block_data_order_avx +#define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw +#define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon +#define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw +#define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +#define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt +#define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +#define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key +#define x25519_NEON ring_core_0_17_14__x25519_NEON +#define 
x25519_fe_invert ring_core_0_17_14__x25519_fe_invert +#define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative +#define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt +#define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg +#define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes +#define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +#define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime +#define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base +#define x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx +#define x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked +#define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask +#define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd +#define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce +#define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx +#define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked + +#endif +#endif diff --git a/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_nasm.inc b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_nasm.inc new file mode 100644 index 0000000000..900abb03b5 --- /dev/null +++ b/ring-0.17.14/pregenerated/ring_core_generated/prefix_symbols_nasm.inc @@ -0,0 +1,334 @@ + +%ifndef ring_core_generated_PREFIX_SYMBOLS_NASM_INC +%define ring_core_generated_PREFIX_SYMBOLS_NASM_INC + +%ifidn __OUTPUT_FORMAT__,win32 +%define _ecp_nistz256_point_double _p256_point_double +%define _ecp_nistz256_point_add _p256_point_add +%define _ecp_nistz256_point_add_affine _p256_point_add_affine +%define _ecp_nistz256_ord_mul_mont _p256_scalar_mul_mont +%define _ecp_nistz256_ord_sqr_mont _p256_scalar_sqr_rep_mont +%define _ecp_nistz256_mul_mont _p256_mul_mont +%define _ecp_nistz256_sqr_mont _p256_sqr_mont +%define _adx_bmi2_available _ring_core_0_17_14__adx_bmi2_available +%define _avx2_available _ring_core_0_17_14__avx2_available +%define _CRYPTO_memcmp _ring_core_0_17_14__CRYPTO_memcmp +%define _CRYPTO_poly1305_finish _ring_core_0_17_14__CRYPTO_poly1305_finish +%define _CRYPTO_poly1305_finish_neon _ring_core_0_17_14__CRYPTO_poly1305_finish_neon +%define _CRYPTO_poly1305_init _ring_core_0_17_14__CRYPTO_poly1305_init +%define _CRYPTO_poly1305_init_neon _ring_core_0_17_14__CRYPTO_poly1305_init_neon +%define _CRYPTO_poly1305_update _ring_core_0_17_14__CRYPTO_poly1305_update +%define _CRYPTO_poly1305_update_neon _ring_core_0_17_14__CRYPTO_poly1305_update_neon +%define _ChaCha20_ctr32 _ring_core_0_17_14__ChaCha20_ctr32 +%define _ChaCha20_ctr32_avx2 _ring_core_0_17_14__ChaCha20_ctr32_avx2 +%define _ChaCha20_ctr32_neon _ring_core_0_17_14__ChaCha20_ctr32_neon +%define _ChaCha20_ctr32_nohw _ring_core_0_17_14__ChaCha20_ctr32_nohw +%define _ChaCha20_ctr32_ssse3 _ring_core_0_17_14__ChaCha20_ctr32_ssse3 +%define _ChaCha20_ctr32_ssse3_4x _ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +%define _LIMB_is_zero _ring_core_0_17_14__LIMB_is_zero +%define _LIMBS_add_mod _ring_core_0_17_14__LIMBS_add_mod +%define _LIMBS_are_zero _ring_core_0_17_14__LIMBS_are_zero +%define _LIMBS_equal _ring_core_0_17_14__LIMBS_equal +%define _LIMBS_less_than _ring_core_0_17_14__LIMBS_less_than +%define _LIMBS_reduce_once _ring_core_0_17_14__LIMBS_reduce_once +%define _LIMBS_select_512_32 _ring_core_0_17_14__LIMBS_select_512_32 +%define _LIMBS_shl_mod _ring_core_0_17_14__LIMBS_shl_mod +%define 
_LIMBS_sub_mod _ring_core_0_17_14__LIMBS_sub_mod +%define _LIMBS_window5_split_window _ring_core_0_17_14__LIMBS_window5_split_window +%define _LIMBS_window5_unsplit_window _ring_core_0_17_14__LIMBS_window5_unsplit_window +%define _LIMB_shr _ring_core_0_17_14__LIMB_shr +%define _OPENSSL_cpuid_setup _ring_core_0_17_14__OPENSSL_cpuid_setup +%define _aes_gcm_dec_kernel _ring_core_0_17_14__aes_gcm_dec_kernel +%define _aes_gcm_dec_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +%define _aes_gcm_enc_kernel _ring_core_0_17_14__aes_gcm_enc_kernel +%define _aes_gcm_enc_update_vaes_avx2 _ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +%define _aes_hw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +%define _aes_hw_set_encrypt_key _ring_core_0_17_14__aes_hw_set_encrypt_key +%define _aes_hw_set_encrypt_key_alt _ring_core_0_17_14__aes_hw_set_encrypt_key_alt +%define _aes_hw_set_encrypt_key_base _ring_core_0_17_14__aes_hw_set_encrypt_key_base +%define _aes_nohw_ctr32_encrypt_blocks _ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +%define _aes_nohw_encrypt _ring_core_0_17_14__aes_nohw_encrypt +%define _aes_nohw_set_encrypt_key _ring_core_0_17_14__aes_nohw_set_encrypt_key +%define _aesni_gcm_decrypt _ring_core_0_17_14__aesni_gcm_decrypt +%define _aesni_gcm_encrypt _ring_core_0_17_14__aesni_gcm_encrypt +%define _bn_from_montgomery_in_place _ring_core_0_17_14__bn_from_montgomery_in_place +%define _bn_gather5 _ring_core_0_17_14__bn_gather5 +%define _bn_mul_mont _ring_core_0_17_14__bn_mul_mont +%define _bn_mul_mont_nohw _ring_core_0_17_14__bn_mul_mont_nohw +%define _bn_mul4x_mont _ring_core_0_17_14__bn_mul4x_mont +%define _bn_mulx4x_mont _ring_core_0_17_14__bn_mulx4x_mont +%define _bn_mul8x_mont_neon _ring_core_0_17_14__bn_mul8x_mont_neon +%define _bn_mul4x_mont_gather5 _ring_core_0_17_14__bn_mul4x_mont_gather5 +%define _bn_mulx4x_mont_gather5 _ring_core_0_17_14__bn_mulx4x_mont_gather5 +%define _bn_neg_inv_mod_r_u64 _ring_core_0_17_14__bn_neg_inv_mod_r_u64 +%define _bn_power5_nohw _ring_core_0_17_14__bn_power5_nohw +%define _bn_powerx5 _ring_core_0_17_14__bn_powerx5 +%define _bn_scatter5 _ring_core_0_17_14__bn_scatter5 +%define _bn_sqr8x_internal _ring_core_0_17_14__bn_sqr8x_internal +%define _bn_sqr8x_mont _ring_core_0_17_14__bn_sqr8x_mont +%define _bn_sqrx8x_internal _ring_core_0_17_14__bn_sqrx8x_internal +%define _bsaes_ctr32_encrypt_blocks _ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +%define _bssl_constant_time_test_conditional_memcpy _ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +%define _bssl_constant_time_test_conditional_memxor _ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +%define _bssl_constant_time_test_main _ring_core_0_17_14__bssl_constant_time_test_main +%define _chacha20_poly1305_open _ring_core_0_17_14__chacha20_poly1305_open +%define _chacha20_poly1305_open_avx2 _ring_core_0_17_14__chacha20_poly1305_open_avx2 +%define _chacha20_poly1305_open_sse41 _ring_core_0_17_14__chacha20_poly1305_open_sse41 +%define _chacha20_poly1305_seal _ring_core_0_17_14__chacha20_poly1305_seal +%define _chacha20_poly1305_seal_avx2 _ring_core_0_17_14__chacha20_poly1305_seal_avx2 +%define _chacha20_poly1305_seal_sse41 _ring_core_0_17_14__chacha20_poly1305_seal_sse41 +%define _ecp_nistz256_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_mul_mont_adx +%define _ecp_nistz256_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +%define _ecp_nistz256_ord_mul_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +%define 
_ecp_nistz256_ord_mul_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +%define _ecp_nistz256_ord_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +%define _ecp_nistz256_ord_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +%define _ecp_nistz256_point_add_adx _ring_core_0_17_14__ecp_nistz256_point_add_adx +%define _ecp_nistz256_point_add_nohw _ring_core_0_17_14__ecp_nistz256_point_add_nohw +%define _ecp_nistz256_point_add_affine_adx _ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +%define _ecp_nistz256_point_add_affine_nohw _ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +%define _ecp_nistz256_point_double_adx _ring_core_0_17_14__ecp_nistz256_point_double_adx +%define _ecp_nistz256_point_double_nohw _ring_core_0_17_14__ecp_nistz256_point_double_nohw +%define _ecp_nistz256_select_w5_avx2 _ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +%define _ecp_nistz256_select_w5_nohw _ring_core_0_17_14__ecp_nistz256_select_w5_nohw +%define _ecp_nistz256_select_w7_avx2 _ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +%define _ecp_nistz256_select_w7_nohw _ring_core_0_17_14__ecp_nistz256_select_w7_nohw +%define _ecp_nistz256_sqr_mont_adx _ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +%define _ecp_nistz256_sqr_mont_nohw _ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +%define _fiat_curve25519_adx_mul _ring_core_0_17_14__fiat_curve25519_adx_mul +%define _fiat_curve25519_adx_square _ring_core_0_17_14__fiat_curve25519_adx_square +%define _gcm_ghash_avx _ring_core_0_17_14__gcm_ghash_avx +%define _gcm_ghash_clmul _ring_core_0_17_14__gcm_ghash_clmul +%define _gcm_ghash_neon _ring_core_0_17_14__gcm_ghash_neon +%define _gcm_ghash_vpclmulqdq_avx2_1 _ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +%define _gcm_gmult_clmul _ring_core_0_17_14__gcm_gmult_clmul +%define _gcm_gmult_neon _ring_core_0_17_14__gcm_gmult_neon +%define _gcm_init_avx _ring_core_0_17_14__gcm_init_avx +%define _gcm_init_clmul _ring_core_0_17_14__gcm_init_clmul +%define _gcm_init_neon _ring_core_0_17_14__gcm_init_neon +%define _gcm_init_vpclmulqdq_avx2 _ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +%define _k25519Precomp _ring_core_0_17_14__k25519Precomp +%define _limbs_mul_add_limb _ring_core_0_17_14__limbs_mul_add_limb +%define _little_endian_bytes_from_scalar _ring_core_0_17_14__little_endian_bytes_from_scalar +%define _ecp_nistz256_neg _ring_core_0_17_14__ecp_nistz256_neg +%define _ecp_nistz256_select_w5 _ring_core_0_17_14__ecp_nistz256_select_w5 +%define _ecp_nistz256_select_w7 _ring_core_0_17_14__ecp_nistz256_select_w7 +%define _neon_available _ring_core_0_17_14__neon_available +%define _p256_mul_mont _ring_core_0_17_14__p256_mul_mont +%define _p256_point_add _ring_core_0_17_14__p256_point_add +%define _p256_point_add_affine _ring_core_0_17_14__p256_point_add_affine +%define _p256_point_double _ring_core_0_17_14__p256_point_double +%define _p256_point_mul _ring_core_0_17_14__p256_point_mul +%define _p256_point_mul_base _ring_core_0_17_14__p256_point_mul_base +%define _p256_point_mul_base_vartime _ring_core_0_17_14__p256_point_mul_base_vartime +%define _p256_scalar_mul_mont _ring_core_0_17_14__p256_scalar_mul_mont +%define _p256_scalar_sqr_rep_mont _ring_core_0_17_14__p256_scalar_sqr_rep_mont +%define _p256_sqr_mont _ring_core_0_17_14__p256_sqr_mont +%define _p384_elem_div_by_2 _ring_core_0_17_14__p384_elem_div_by_2 +%define _p384_elem_mul_mont _ring_core_0_17_14__p384_elem_mul_mont +%define _p384_elem_neg _ring_core_0_17_14__p384_elem_neg +%define _p384_elem_sub 
_ring_core_0_17_14__p384_elem_sub +%define _p384_point_add _ring_core_0_17_14__p384_point_add +%define _p384_point_double _ring_core_0_17_14__p384_point_double +%define _p384_point_mul _ring_core_0_17_14__p384_point_mul +%define _p384_scalar_mul_mont _ring_core_0_17_14__p384_scalar_mul_mont +%define _openssl_poly1305_neon2_addmulmod _ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +%define _openssl_poly1305_neon2_blocks _ring_core_0_17_14__openssl_poly1305_neon2_blocks +%define _sha256_block_data_order _ring_core_0_17_14__sha256_block_data_order +%define _sha256_block_data_order_avx _ring_core_0_17_14__sha256_block_data_order_avx +%define _sha256_block_data_order_ssse3 _ring_core_0_17_14__sha256_block_data_order_ssse3 +%define _sha256_block_data_order_hw _ring_core_0_17_14__sha256_block_data_order_hw +%define _sha256_block_data_order_neon _ring_core_0_17_14__sha256_block_data_order_neon +%define _sha256_block_data_order_nohw _ring_core_0_17_14__sha256_block_data_order_nohw +%define _sha512_block_data_order _ring_core_0_17_14__sha512_block_data_order +%define _sha512_block_data_order_avx _ring_core_0_17_14__sha512_block_data_order_avx +%define _sha512_block_data_order_hw _ring_core_0_17_14__sha512_block_data_order_hw +%define _sha512_block_data_order_neon _ring_core_0_17_14__sha512_block_data_order_neon +%define _sha512_block_data_order_nohw _ring_core_0_17_14__sha512_block_data_order_nohw +%define _vpaes_ctr32_encrypt_blocks _ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +%define _vpaes_encrypt _ring_core_0_17_14__vpaes_encrypt +%define _vpaes_encrypt_key_to_bsaes _ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +%define _vpaes_set_encrypt_key _ring_core_0_17_14__vpaes_set_encrypt_key +%define _x25519_NEON _ring_core_0_17_14__x25519_NEON +%define _x25519_fe_invert _ring_core_0_17_14__x25519_fe_invert +%define _x25519_fe_isnegative _ring_core_0_17_14__x25519_fe_isnegative +%define _x25519_fe_mul_ttt _ring_core_0_17_14__x25519_fe_mul_ttt +%define _x25519_fe_neg _ring_core_0_17_14__x25519_fe_neg +%define _x25519_fe_tobytes _ring_core_0_17_14__x25519_fe_tobytes +%define _x25519_ge_double_scalarmult_vartime _ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +%define _x25519_ge_frombytes_vartime _ring_core_0_17_14__x25519_ge_frombytes_vartime +%define _x25519_ge_scalarmult_base _ring_core_0_17_14__x25519_ge_scalarmult_base +%define _x25519_ge_scalarmult_base_adx _ring_core_0_17_14__x25519_ge_scalarmult_base_adx +%define _x25519_public_from_private_generic_masked _ring_core_0_17_14__x25519_public_from_private_generic_masked +%define _x25519_sc_mask _ring_core_0_17_14__x25519_sc_mask +%define _x25519_sc_muladd _ring_core_0_17_14__x25519_sc_muladd +%define _x25519_sc_reduce _ring_core_0_17_14__x25519_sc_reduce +%define _x25519_scalar_mult_adx _ring_core_0_17_14__x25519_scalar_mult_adx +%define _x25519_scalar_mult_generic_masked _ring_core_0_17_14__x25519_scalar_mult_generic_masked + +%else +%define ecp_nistz256_point_double p256_point_double +%define ecp_nistz256_point_add p256_point_add +%define ecp_nistz256_point_add_affine p256_point_add_affine +%define ecp_nistz256_ord_mul_mont p256_scalar_mul_mont +%define ecp_nistz256_ord_sqr_mont p256_scalar_sqr_rep_mont +%define ecp_nistz256_mul_mont p256_mul_mont +%define ecp_nistz256_sqr_mont p256_sqr_mont +%define adx_bmi2_available ring_core_0_17_14__adx_bmi2_available +%define avx2_available ring_core_0_17_14__avx2_available +%define CRYPTO_memcmp ring_core_0_17_14__CRYPTO_memcmp +%define CRYPTO_poly1305_finish 
ring_core_0_17_14__CRYPTO_poly1305_finish +%define CRYPTO_poly1305_finish_neon ring_core_0_17_14__CRYPTO_poly1305_finish_neon +%define CRYPTO_poly1305_init ring_core_0_17_14__CRYPTO_poly1305_init +%define CRYPTO_poly1305_init_neon ring_core_0_17_14__CRYPTO_poly1305_init_neon +%define CRYPTO_poly1305_update ring_core_0_17_14__CRYPTO_poly1305_update +%define CRYPTO_poly1305_update_neon ring_core_0_17_14__CRYPTO_poly1305_update_neon +%define ChaCha20_ctr32 ring_core_0_17_14__ChaCha20_ctr32 +%define ChaCha20_ctr32_avx2 ring_core_0_17_14__ChaCha20_ctr32_avx2 +%define ChaCha20_ctr32_neon ring_core_0_17_14__ChaCha20_ctr32_neon +%define ChaCha20_ctr32_nohw ring_core_0_17_14__ChaCha20_ctr32_nohw +%define ChaCha20_ctr32_ssse3 ring_core_0_17_14__ChaCha20_ctr32_ssse3 +%define ChaCha20_ctr32_ssse3_4x ring_core_0_17_14__ChaCha20_ctr32_ssse3_4x +%define LIMB_is_zero ring_core_0_17_14__LIMB_is_zero +%define LIMBS_add_mod ring_core_0_17_14__LIMBS_add_mod +%define LIMBS_are_zero ring_core_0_17_14__LIMBS_are_zero +%define LIMBS_equal ring_core_0_17_14__LIMBS_equal +%define LIMBS_less_than ring_core_0_17_14__LIMBS_less_than +%define LIMBS_reduce_once ring_core_0_17_14__LIMBS_reduce_once +%define LIMBS_select_512_32 ring_core_0_17_14__LIMBS_select_512_32 +%define LIMBS_shl_mod ring_core_0_17_14__LIMBS_shl_mod +%define LIMBS_sub_mod ring_core_0_17_14__LIMBS_sub_mod +%define LIMBS_window5_split_window ring_core_0_17_14__LIMBS_window5_split_window +%define LIMBS_window5_unsplit_window ring_core_0_17_14__LIMBS_window5_unsplit_window +%define LIMB_shr ring_core_0_17_14__LIMB_shr +%define OPENSSL_cpuid_setup ring_core_0_17_14__OPENSSL_cpuid_setup +%define aes_gcm_dec_kernel ring_core_0_17_14__aes_gcm_dec_kernel +%define aes_gcm_dec_update_vaes_avx2 ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 +%define aes_gcm_enc_kernel ring_core_0_17_14__aes_gcm_enc_kernel +%define aes_gcm_enc_update_vaes_avx2 ring_core_0_17_14__aes_gcm_enc_update_vaes_avx2 +%define aes_hw_ctr32_encrypt_blocks ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks +%define aes_hw_set_encrypt_key ring_core_0_17_14__aes_hw_set_encrypt_key +%define aes_hw_set_encrypt_key_alt ring_core_0_17_14__aes_hw_set_encrypt_key_alt +%define aes_hw_set_encrypt_key_base ring_core_0_17_14__aes_hw_set_encrypt_key_base +%define aes_nohw_ctr32_encrypt_blocks ring_core_0_17_14__aes_nohw_ctr32_encrypt_blocks +%define aes_nohw_encrypt ring_core_0_17_14__aes_nohw_encrypt +%define aes_nohw_set_encrypt_key ring_core_0_17_14__aes_nohw_set_encrypt_key +%define aesni_gcm_decrypt ring_core_0_17_14__aesni_gcm_decrypt +%define aesni_gcm_encrypt ring_core_0_17_14__aesni_gcm_encrypt +%define bn_from_montgomery_in_place ring_core_0_17_14__bn_from_montgomery_in_place +%define bn_gather5 ring_core_0_17_14__bn_gather5 +%define bn_mul_mont ring_core_0_17_14__bn_mul_mont +%define bn_mul_mont_nohw ring_core_0_17_14__bn_mul_mont_nohw +%define bn_mul4x_mont ring_core_0_17_14__bn_mul4x_mont +%define bn_mulx4x_mont ring_core_0_17_14__bn_mulx4x_mont +%define bn_mul8x_mont_neon ring_core_0_17_14__bn_mul8x_mont_neon +%define bn_mul4x_mont_gather5 ring_core_0_17_14__bn_mul4x_mont_gather5 +%define bn_mulx4x_mont_gather5 ring_core_0_17_14__bn_mulx4x_mont_gather5 +%define bn_neg_inv_mod_r_u64 ring_core_0_17_14__bn_neg_inv_mod_r_u64 +%define bn_power5_nohw ring_core_0_17_14__bn_power5_nohw +%define bn_powerx5 ring_core_0_17_14__bn_powerx5 +%define bn_scatter5 ring_core_0_17_14__bn_scatter5 +%define bn_sqr8x_internal ring_core_0_17_14__bn_sqr8x_internal +%define bn_sqr8x_mont 
ring_core_0_17_14__bn_sqr8x_mont +%define bn_sqrx8x_internal ring_core_0_17_14__bn_sqrx8x_internal +%define bsaes_ctr32_encrypt_blocks ring_core_0_17_14__bsaes_ctr32_encrypt_blocks +%define bssl_constant_time_test_conditional_memcpy ring_core_0_17_14__bssl_constant_time_test_conditional_memcpy +%define bssl_constant_time_test_conditional_memxor ring_core_0_17_14__bssl_constant_time_test_conditional_memxor +%define bssl_constant_time_test_main ring_core_0_17_14__bssl_constant_time_test_main +%define chacha20_poly1305_open ring_core_0_17_14__chacha20_poly1305_open +%define chacha20_poly1305_open_avx2 ring_core_0_17_14__chacha20_poly1305_open_avx2 +%define chacha20_poly1305_open_sse41 ring_core_0_17_14__chacha20_poly1305_open_sse41 +%define chacha20_poly1305_seal ring_core_0_17_14__chacha20_poly1305_seal +%define chacha20_poly1305_seal_avx2 ring_core_0_17_14__chacha20_poly1305_seal_avx2 +%define chacha20_poly1305_seal_sse41 ring_core_0_17_14__chacha20_poly1305_seal_sse41 +%define ecp_nistz256_mul_mont_adx ring_core_0_17_14__ecp_nistz256_mul_mont_adx +%define ecp_nistz256_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_mul_mont_nohw +%define ecp_nistz256_ord_mul_mont_adx ring_core_0_17_14__ecp_nistz256_ord_mul_mont_adx +%define ecp_nistz256_ord_mul_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_mul_mont_nohw +%define ecp_nistz256_ord_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_adx +%define ecp_nistz256_ord_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_ord_sqr_mont_nohw +%define ecp_nistz256_point_add_adx ring_core_0_17_14__ecp_nistz256_point_add_adx +%define ecp_nistz256_point_add_nohw ring_core_0_17_14__ecp_nistz256_point_add_nohw +%define ecp_nistz256_point_add_affine_adx ring_core_0_17_14__ecp_nistz256_point_add_affine_adx +%define ecp_nistz256_point_add_affine_nohw ring_core_0_17_14__ecp_nistz256_point_add_affine_nohw +%define ecp_nistz256_point_double_adx ring_core_0_17_14__ecp_nistz256_point_double_adx +%define ecp_nistz256_point_double_nohw ring_core_0_17_14__ecp_nistz256_point_double_nohw +%define ecp_nistz256_select_w5_avx2 ring_core_0_17_14__ecp_nistz256_select_w5_avx2 +%define ecp_nistz256_select_w5_nohw ring_core_0_17_14__ecp_nistz256_select_w5_nohw +%define ecp_nistz256_select_w7_avx2 ring_core_0_17_14__ecp_nistz256_select_w7_avx2 +%define ecp_nistz256_select_w7_nohw ring_core_0_17_14__ecp_nistz256_select_w7_nohw +%define ecp_nistz256_sqr_mont_adx ring_core_0_17_14__ecp_nistz256_sqr_mont_adx +%define ecp_nistz256_sqr_mont_nohw ring_core_0_17_14__ecp_nistz256_sqr_mont_nohw +%define fiat_curve25519_adx_mul ring_core_0_17_14__fiat_curve25519_adx_mul +%define fiat_curve25519_adx_square ring_core_0_17_14__fiat_curve25519_adx_square +%define gcm_ghash_avx ring_core_0_17_14__gcm_ghash_avx +%define gcm_ghash_clmul ring_core_0_17_14__gcm_ghash_clmul +%define gcm_ghash_neon ring_core_0_17_14__gcm_ghash_neon +%define gcm_ghash_vpclmulqdq_avx2_1 ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 +%define gcm_gmult_clmul ring_core_0_17_14__gcm_gmult_clmul +%define gcm_gmult_neon ring_core_0_17_14__gcm_gmult_neon +%define gcm_init_avx ring_core_0_17_14__gcm_init_avx +%define gcm_init_clmul ring_core_0_17_14__gcm_init_clmul +%define gcm_init_neon ring_core_0_17_14__gcm_init_neon +%define gcm_init_vpclmulqdq_avx2 ring_core_0_17_14__gcm_init_vpclmulqdq_avx2 +%define k25519Precomp ring_core_0_17_14__k25519Precomp +%define limbs_mul_add_limb ring_core_0_17_14__limbs_mul_add_limb +%define little_endian_bytes_from_scalar ring_core_0_17_14__little_endian_bytes_from_scalar +%define 
ecp_nistz256_neg ring_core_0_17_14__ecp_nistz256_neg +%define ecp_nistz256_select_w5 ring_core_0_17_14__ecp_nistz256_select_w5 +%define ecp_nistz256_select_w7 ring_core_0_17_14__ecp_nistz256_select_w7 +%define neon_available ring_core_0_17_14__neon_available +%define p256_mul_mont ring_core_0_17_14__p256_mul_mont +%define p256_point_add ring_core_0_17_14__p256_point_add +%define p256_point_add_affine ring_core_0_17_14__p256_point_add_affine +%define p256_point_double ring_core_0_17_14__p256_point_double +%define p256_point_mul ring_core_0_17_14__p256_point_mul +%define p256_point_mul_base ring_core_0_17_14__p256_point_mul_base +%define p256_point_mul_base_vartime ring_core_0_17_14__p256_point_mul_base_vartime +%define p256_scalar_mul_mont ring_core_0_17_14__p256_scalar_mul_mont +%define p256_scalar_sqr_rep_mont ring_core_0_17_14__p256_scalar_sqr_rep_mont +%define p256_sqr_mont ring_core_0_17_14__p256_sqr_mont +%define p384_elem_div_by_2 ring_core_0_17_14__p384_elem_div_by_2 +%define p384_elem_mul_mont ring_core_0_17_14__p384_elem_mul_mont +%define p384_elem_neg ring_core_0_17_14__p384_elem_neg +%define p384_elem_sub ring_core_0_17_14__p384_elem_sub +%define p384_point_add ring_core_0_17_14__p384_point_add +%define p384_point_double ring_core_0_17_14__p384_point_double +%define p384_point_mul ring_core_0_17_14__p384_point_mul +%define p384_scalar_mul_mont ring_core_0_17_14__p384_scalar_mul_mont +%define openssl_poly1305_neon2_addmulmod ring_core_0_17_14__openssl_poly1305_neon2_addmulmod +%define openssl_poly1305_neon2_blocks ring_core_0_17_14__openssl_poly1305_neon2_blocks +%define sha256_block_data_order ring_core_0_17_14__sha256_block_data_order +%define sha256_block_data_order_avx ring_core_0_17_14__sha256_block_data_order_avx +%define sha256_block_data_order_ssse3 ring_core_0_17_14__sha256_block_data_order_ssse3 +%define sha256_block_data_order_hw ring_core_0_17_14__sha256_block_data_order_hw +%define sha256_block_data_order_neon ring_core_0_17_14__sha256_block_data_order_neon +%define sha256_block_data_order_nohw ring_core_0_17_14__sha256_block_data_order_nohw +%define sha512_block_data_order ring_core_0_17_14__sha512_block_data_order +%define sha512_block_data_order_avx ring_core_0_17_14__sha512_block_data_order_avx +%define sha512_block_data_order_hw ring_core_0_17_14__sha512_block_data_order_hw +%define sha512_block_data_order_neon ring_core_0_17_14__sha512_block_data_order_neon +%define sha512_block_data_order_nohw ring_core_0_17_14__sha512_block_data_order_nohw +%define vpaes_ctr32_encrypt_blocks ring_core_0_17_14__vpaes_ctr32_encrypt_blocks +%define vpaes_encrypt ring_core_0_17_14__vpaes_encrypt +%define vpaes_encrypt_key_to_bsaes ring_core_0_17_14__vpaes_encrypt_key_to_bsaes +%define vpaes_set_encrypt_key ring_core_0_17_14__vpaes_set_encrypt_key +%define x25519_NEON ring_core_0_17_14__x25519_NEON +%define x25519_fe_invert ring_core_0_17_14__x25519_fe_invert +%define x25519_fe_isnegative ring_core_0_17_14__x25519_fe_isnegative +%define x25519_fe_mul_ttt ring_core_0_17_14__x25519_fe_mul_ttt +%define x25519_fe_neg ring_core_0_17_14__x25519_fe_neg +%define x25519_fe_tobytes ring_core_0_17_14__x25519_fe_tobytes +%define x25519_ge_double_scalarmult_vartime ring_core_0_17_14__x25519_ge_double_scalarmult_vartime +%define x25519_ge_frombytes_vartime ring_core_0_17_14__x25519_ge_frombytes_vartime +%define x25519_ge_scalarmult_base ring_core_0_17_14__x25519_ge_scalarmult_base +%define x25519_ge_scalarmult_base_adx ring_core_0_17_14__x25519_ge_scalarmult_base_adx +%define 
x25519_public_from_private_generic_masked ring_core_0_17_14__x25519_public_from_private_generic_masked +%define x25519_sc_mask ring_core_0_17_14__x25519_sc_mask +%define x25519_sc_muladd ring_core_0_17_14__x25519_sc_muladd +%define x25519_sc_reduce ring_core_0_17_14__x25519_sc_reduce +%define x25519_scalar_mult_adx ring_core_0_17_14__x25519_scalar_mult_adx +%define x25519_scalar_mult_generic_masked ring_core_0_17_14__x25519_scalar_mult_generic_masked + +%endif +%endif diff --git a/ring-0.17.14/pregenerated/sha256-armv4-linux32.S b/ring-0.17.14/pregenerated/sha256-armv4-linux32.S new file mode 100644 index 0000000000..b8b33352df --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-armv4-linux32.S @@ -0,0 +1,2678 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +#ifdef __KERNEL__ +# define __ARM_ARCH __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. (See unsha256.) 
+.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +.align 5 + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +sha256_block_data_order_nohw: + add r2,r1,r2,lsl#6 @ len to point at the end of inp + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + adr r14,K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 0<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 1<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 3<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 5<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 7<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 9<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 11<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 13<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 15<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +.Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 17<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 19<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 21<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 23<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 25<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 27<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 29<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 31<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.LK256_shortcut_neon: +@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. +#if defined(__thumb2__) +.word K256-(.LK256_add_neon+4) +#else +.word K256-(.LK256_add_neon+8) +#endif + +.globl sha256_block_data_order_neon +.hidden sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 5 +.skip 16 +sha256_block_data_order_neon: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + + sub r11,sp,#16*4+16 + + @ K256 is just at the boundary of being easily referenced by an ADR from + @ this function. In Arm mode, when building with __ARM_ARCH=6, it does + @ not fit. By moving code around, we could make it fit, but this is too + @ fragile. For simplicity, just load the offset from + @ .LK256_shortcut_neon. + @ + @ TODO(davidben): adrl would avoid a load, but clang-assembler does not + @ support it. 
We might be able to emulate it with a macro, but Android's + @ did not work when I tried it. + @ https://android.googlesource.com/platform/ndk/+/refs/heads/main/docs/ClangMigration.md#arm + ldr r14,.LK256_shortcut_neon +.LK256_add_neon: + add r14,pc,r14 + + bic r11,r11,#15 @ align for 128-bit stores + mov r12,sp + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! + vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! + + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! 
+ add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + it eq + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + it ne + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! 
+ ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8,r9,r10,r11} + + ittte ne + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/sha256-armv8-ios64.S b/ring-0.17.14/pregenerated/sha256-armv8-ios64.S new file mode 100644 index 0000000000..8ecafb46a2 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-armv8-ios64.S @@ -0,0 +1,1195 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. 
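The per-round annotations in both listings (Sigma1(e), Ch(e,f,g), Maj(a,b,c), sigma0(X[i+1]), sigma1(X[i+14]), h+=K256[i]) are the standard FIPS 180-4 SHA-256 operations; the scalar path unrolls them with rotated-operand tricks, and the NEON path computes the message schedule four words at a time with vext/vshr/vsli. A minimal Rust sketch of the same compression step, as a reference for what the register-level code computes, might look as follows; the function names and the caller-supplied k table are illustrative only and are not part of the generated .S files or of ring's API.

    // Illustrative sketch (not from the generated sources): scalar SHA-256
    // compression as annotated in the listings above. `k` is the caller's
    // K256 round-constant table (64 words).
    fn sigma0_big(a: u32) -> u32 { a.rotate_right(2) ^ a.rotate_right(13) ^ a.rotate_right(22) }
    fn sigma1_big(e: u32) -> u32 { e.rotate_right(6) ^ e.rotate_right(11) ^ e.rotate_right(25) }
    fn sigma0_small(x: u32) -> u32 { x.rotate_right(7) ^ x.rotate_right(18) ^ (x >> 3) }
    fn sigma1_small(x: u32) -> u32 { x.rotate_right(17) ^ x.rotate_right(19) ^ (x >> 10) }
    fn ch(e: u32, f: u32, g: u32) -> u32 { (e & f) ^ (!e & g) }
    fn maj(a: u32, b: u32, c: u32) -> u32 { (a & b) ^ (a & c) ^ (b & c) }

    fn compress_block(state: &mut [u32; 8], block: &[u32; 16], k: &[u32; 64]) {
        // Message schedule: rounds 16..63 reuse earlier words, matching the
        // "from future BODY_16_xx" loads and the sigma0/sigma1 comments.
        let mut w = [0u32; 64];
        w[..16].copy_from_slice(block);
        for i in 16..64 {
            w[i] = sigma1_small(w[i - 2])
                .wrapping_add(w[i - 7])
                .wrapping_add(sigma0_small(w[i - 15]))
                .wrapping_add(w[i - 16]);
        }
        let [mut a, mut b, mut c, mut d, mut e, mut f, mut g, mut h] = *state;
        for i in 0..64 {
            // "h+=X[i]", "h+=K256[i]", "h+=Sigma1(e)", "h+=Ch(e,f,g)" in the listings.
            let t1 = h
                .wrapping_add(sigma1_big(e))
                .wrapping_add(ch(e, f, g))
                .wrapping_add(k[i])
                .wrapping_add(w[i]);
            // "h+=Sigma0(a)" and "h+=Maj(a,b,c)"; the assembly defers the Maj
            // add by one round ("h+=Maj(a,b,c) from the past").
            let t2 = sigma0_big(a).wrapping_add(maj(a, b, c));
            h = g; g = f; f = e; e = d.wrapping_add(t1);
            d = c; c = b; b = a; a = t1.wrapping_add(t2);
        }
        // Accumulate into the hash context, as in the epilogue that reloads
        // the ctx words, adds the old state, and stores the result back.
        let out = [a, b, c, d, e, f, g, h];
        for i in 0..8 {
            state[i] = state[i].wrapping_add(out[i]);
        }
    }

The rotation constants above explain the listings' seemingly different shift amounts: for example, Sigma1(e) = (e ror 6) ^ (e ror 11) ^ (e ror 25) is realized as a single trailing ror #6 applied to e ^ (e ror 5) ^ (e ror 19), which is exactly the eor/ror pattern marked "Sigma1(e)" in the scalar code.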
+ +#ifndef __KERNEL__ +#endif + +.text + +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw + +.align 6 +_sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256@PAGE + add x30,x30,LK256@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add 
w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor 
w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + 
eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor 
w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + 
add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add 
w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str 
w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section __TEXT,__const +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl _sha256_block_data_order_hw +.private_extern _sha256_block_data_order_hw + +.align 6 +_sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256@PAGE + add x3,x3,LK256@PAGEOFF + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 
0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/sha256-armv8-linux64.S b/ring-0.17.14/pregenerated/sha256-armv8-linux64.S new file mode 100644 index 0000000000..94c0682a95 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-armv8-linux64.S @@ -0,0 +1,1195 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +#endif + +.text + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +.align 6 +sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,.LK256 + add x30,x30,:lo12:.LK256 + stp x0,x2,[x29,#96] + +.Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror 
w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) 
+ eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + 
eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +.Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // 
Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // 
*K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // 
h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + 
eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw + +.section .rodata +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,.LK256 + add x3,x3,:lo12:.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 
0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/sha256-armv8-win64.S b/ring-0.17.14/pregenerated/sha256-armv8-win64.S new file mode 100644 index 0000000000..3049ef3eb1 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-armv8-win64.S @@ -0,0 +1,1199 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. 
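For reference while reading the scalar sha256_block_data_order_nohw routine below: its per-instruction comments (Sigma1(e), Ch(e,f,g), Maj(a,b,c), sigma0, sigma1) name the standard FIPS 180-4 SHA-256 building blocks. The following minimal Rust sketch shows what one compression round and the Loop_16_xx message-schedule extension compute; it is illustrative only, the helper names are mine, and it is not code from the ring crate or BoringSSL.

// Illustrative FIPS 180-4 SHA-256 round logic; not part of this file or of ring.
fn big_sigma0(x: u32) -> u32 { x.rotate_right(2) ^ x.rotate_right(13) ^ x.rotate_right(22) }
fn big_sigma1(x: u32) -> u32 { x.rotate_right(6) ^ x.rotate_right(11) ^ x.rotate_right(25) }
fn small_sigma0(x: u32) -> u32 { x.rotate_right(7) ^ x.rotate_right(18) ^ (x >> 3) }
fn small_sigma1(x: u32) -> u32 { x.rotate_right(17) ^ x.rotate_right(19) ^ (x >> 10) }
fn ch(e: u32, f: u32, g: u32) -> u32 { (e & f) ^ (!e & g) }
fn maj(a: u32, b: u32, c: u32) -> u32 { (a & b) ^ (a & c) ^ (b & c) }

// One round: state is [a,b,c,d,e,f,g,h], k is K256[i], w is W[i].
fn round(state: &mut [u32; 8], k: u32, w: u32) {
    let [a, b, c, d, e, f, g, h] = *state;
    let t1 = h
        .wrapping_add(big_sigma1(e))
        .wrapping_add(ch(e, f, g))
        .wrapping_add(k)
        .wrapping_add(w);
    let t2 = big_sigma0(a).wrapping_add(maj(a, b, c));
    *state = [t1.wrapping_add(t2), a, b, c, d.wrapping_add(t1), e, f, g];
}

// Message-schedule extension for rounds 16..63 (the Loop_16_xx section below).
fn extend(w: &mut [u32; 64]) {
    for i in 16..64 {
        w[i] = small_sigma1(w[i - 2])
            .wrapping_add(w[i - 7])
            .wrapping_add(small_sigma0(w[i - 15]))
            .wrapping_add(w[i - 16]);
    }
}

In the assembly, the eight working variables a..h live in w20-w27, while the 16-word schedule window rotates through w3-w15 and w0-w2 with four spill slots at sp+0..sp+12, so the schedule extension is interleaved with the rounds instead of being precomputed.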
+ +#ifndef __KERNEL__ +#endif + +.text + +.globl sha256_block_data_order_nohw + +.def sha256_block_data_order_nohw + .type 32 +.endef +.align 6 +sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256 + add x30,x30,:lo12:LK256 + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add 
w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor 
w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + 
eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor 
w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + 
add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add 
w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str 
w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw + +.def sha256_block_data_order_hw + .type 32 +.endef +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256 + add x3,x3,:lo12:LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 
//sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/sha256-x86_64-elf.S b/ring-0.17.14/pregenerated/sha256-x86_64-elf.S new file mode 100644 index 0000000000..c800f8ee39 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-x86_64-elf.S @@ -0,0 +1,4170 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,@function +.align 16 +sha256_block_data_order_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 
8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax 
+ movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + 
addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + 
addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl 
%eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl 
$11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl 
%r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +.section .rodata +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,@function +.align 64 +sha256_block_data_order_hw: +.cfi_startproc +_CET_ENDBR + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd 
%xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + ret +.cfi_endproc +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +.globl sha256_block_data_order_ssse3 +.hidden sha256_block_data_order_ssse3 +.type sha256_block_data_order_ssse3,@function +.align 64 +sha256_block_data_order_ssse3: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + 
movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl 
%r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 
+ andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl 
$2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + 
rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + 
addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + ret +.cfi_endproc +.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.globl sha256_block_data_order_avx +.hidden sha256_block_data_order_avx +.type sha256_block_data_order_avx,@function +.align 64 +sha256_block_data_order_avx: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl 
%eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld 
$10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl 
%r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl 
%r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl 
%r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl 
%r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size sha256_block_data_order_avx,.-sha256_block_data_order_avx +#endif diff --git a/ring-0.17.14/pregenerated/sha256-x86_64-macosx.S b/ring-0.17.14/pregenerated/sha256-x86_64-macosx.S new file mode 100644 index 0000000000..fe5d347a59 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-x86_64-macosx.S @@ -0,0 +1,4170 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw + +.p2align 4 +_sha256_block_data_order_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp L$loop + +.p2align 4 +L$loop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + 
xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + 
addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 
0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl 
%eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + 
movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + 
rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz L$rounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl 
%r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue: + ret + + +.section __DATA,__const +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _sha256_block_data_order_hw +.private_extern _sha256_block_data_order_hw + +.p2align 6 +_sha256_block_data_order_hw: + +_CET_ENDBR + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + 
pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + ret + + +.globl _sha256_block_data_order_ssse3 +.private_extern _sha256_block_data_order_ssse3 + +.p2align 6 +_sha256_block_data_order_ssse3: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + 
andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl 
%r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + 
xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl 
%eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl 
%edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + 
rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_ssse3: + ret + + +.globl _sha256_block_data_order_avx +.private_extern _sha256_block_data_order_avx + +.p2align 6 +_sha256_block_data_order_avx: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl 
$5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl 
$5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + 
xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl 
%r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$avx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl 
$14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + 
movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_avx + + movq 88(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx: + ret + + +#endif diff --git a/ring-0.17.14/pregenerated/sha256-x86_64-nasm.asm b/ring-0.17.14/pregenerated/sha256-x86_64-nasm.asm new file mode 100644 index 0000000000..ba0da615be --- /dev/null +++ b/ring-0.17.14/pregenerated/sha256-x86_64-nasm.asm @@ -0,0 +1,4413 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +global sha256_block_data_order_nohw + +ALIGN 16 +sha256_block_data_order_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,16*4+4*8 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + +$L$prologue: + + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov edi,ebx + lea rbp,[K256] + xor edi,ecx + mov r12d,DWORD[rsi] + mov r13d,r8d + mov r14d,eax + bswap r12d + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + add r11d,r14d + mov r12d,DWORD[4+rsi] + mov r13d,edx + mov r14d,r11d + bswap r12d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[4+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + add r10d,r14d + mov r12d,DWORD[8+rsi] + mov r13d,ecx + mov r14d,r10d + bswap r12d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[8+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + add r9d,r14d + mov r12d,DWORD[12+rsi] + mov r13d,ebx + mov r14d,r9d + bswap r12d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[12+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + add r8d,r14d + mov r12d,DWORD[16+rsi] + mov r13d,eax + mov r14d,r8d + bswap r12d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[16+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor 
edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + add edx,r14d + mov r12d,DWORD[20+rsi] + mov r13d,r11d + mov r14d,edx + bswap r12d + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[20+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + add ecx,r14d + mov r12d,DWORD[24+rsi] + mov r13d,r10d + mov r14d,ecx + bswap r12d + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[24+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + add ebx,r14d + mov r12d,DWORD[28+rsi] + mov r13d,r9d + mov r14d,ebx + bswap r12d + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[28+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + add eax,r14d + mov r12d,DWORD[32+rsi] + mov r13d,r8d + mov r14d,eax + bswap r12d + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[32+rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + add r11d,r14d + mov r12d,DWORD[36+rsi] + mov r13d,edx + mov r14d,r11d + bswap r12d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[36+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + add r10d,r14d + mov r12d,DWORD[40+rsi] + mov r13d,ecx + mov r14d,r10d + bswap r12d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[40+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + add r9d,r14d + mov r12d,DWORD[44+rsi] + mov r13d,ebx + mov r14d,r9d + bswap r12d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[44+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror 
r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + add r8d,r14d + mov r12d,DWORD[48+rsi] + mov r13d,eax + mov r14d,r8d + bswap r12d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[48+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + add edx,r14d + mov r12d,DWORD[52+rsi] + mov r13d,r11d + mov r14d,edx + bswap r12d + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[52+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + add ecx,r14d + mov r12d,DWORD[56+rsi] + mov r13d,r10d + mov r14d,ecx + bswap r12d + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[56+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + add ebx,r14d + mov r12d,DWORD[60+rsi] + mov r13d,r9d + mov r14d,ebx + bswap r12d + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[60+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + jmp NEAR $L$rounds_16_xx +ALIGN 16 +$L$rounds_16_xx: + mov r13d,DWORD[4+rsp] + mov r15d,DWORD[56+rsp] + + mov r12d,r13d + ror r13d,11 + add eax,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[36+rsp] + + add r12d,DWORD[rsp] + mov r13d,r8d + add r12d,r15d + mov r14d,eax + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[8+rsp] + mov edi,DWORD[60+rsp] + + mov r12d,r13d + ror r13d,11 + add r11d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[40+rsp] + + add r12d,DWORD[4+rsp] + mov r13d,edx + add r12d,edi + mov r14d,r11d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[4+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + 
ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[12+rsp] + mov r15d,DWORD[rsp] + + mov r12d,r13d + ror r13d,11 + add r10d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[44+rsp] + + add r12d,DWORD[8+rsp] + mov r13d,ecx + add r12d,r15d + mov r14d,r10d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[8+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[16+rsp] + mov edi,DWORD[4+rsp] + + mov r12d,r13d + ror r13d,11 + add r9d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[48+rsp] + + add r12d,DWORD[12+rsp] + mov r13d,ebx + add r12d,edi + mov r14d,r9d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[12+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[20+rsp] + mov r15d,DWORD[8+rsp] + + mov r12d,r13d + ror r13d,11 + add r8d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[52+rsp] + + add r12d,DWORD[16+rsp] + mov r13d,eax + add r12d,r15d + mov r14d,r8d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[16+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[24+rsp] + mov edi,DWORD[12+rsp] + + mov r12d,r13d + ror r13d,11 + add edx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[56+rsp] + + add r12d,DWORD[20+rsp] + mov r13d,r11d + add r12d,edi + mov r14d,edx + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[20+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[28+rsp] + mov r15d,DWORD[16+rsp] + + mov r12d,r13d + ror r13d,11 + add ecx,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 
+ + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[60+rsp] + + add r12d,DWORD[24+rsp] + mov r13d,r10d + add r12d,r15d + mov r14d,ecx + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[24+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[32+rsp] + mov edi,DWORD[20+rsp] + + mov r12d,r13d + ror r13d,11 + add ebx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[rsp] + + add r12d,DWORD[28+rsp] + mov r13d,r9d + add r12d,edi + mov r14d,ebx + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[28+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[36+rsp] + mov r15d,DWORD[24+rsp] + + mov r12d,r13d + ror r13d,11 + add eax,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[4+rsp] + + add r12d,DWORD[32+rsp] + mov r13d,r8d + add r12d,r15d + mov r14d,eax + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[32+rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[40+rsp] + mov edi,DWORD[28+rsp] + + mov r12d,r13d + ror r13d,11 + add r11d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[8+rsp] + + add r12d,DWORD[36+rsp] + mov r13d,edx + add r12d,edi + mov r14d,r11d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[36+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[44+rsp] + mov r15d,DWORD[32+rsp] + + mov r12d,r13d + ror r13d,11 + add r10d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[12+rsp] + + add r12d,DWORD[40+rsp] + mov r13d,ecx + add r12d,r15d + mov r14d,r10d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[40+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d 
+ ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[48+rsp] + mov edi,DWORD[36+rsp] + + mov r12d,r13d + ror r13d,11 + add r9d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[16+rsp] + + add r12d,DWORD[44+rsp] + mov r13d,ebx + add r12d,edi + mov r14d,r9d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[44+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[52+rsp] + mov r15d,DWORD[40+rsp] + + mov r12d,r13d + ror r13d,11 + add r8d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[20+rsp] + + add r12d,DWORD[48+rsp] + mov r13d,eax + add r12d,r15d + mov r14d,r8d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[48+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[56+rsp] + mov edi,DWORD[44+rsp] + + mov r12d,r13d + ror r13d,11 + add edx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[24+rsp] + + add r12d,DWORD[52+rsp] + mov r13d,r11d + add r12d,edi + mov r14d,edx + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[52+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[60+rsp] + mov r15d,DWORD[48+rsp] + + mov r12d,r13d + ror r13d,11 + add ecx,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[28+rsp] + + add r12d,DWORD[56+rsp] + mov r13d,r10d + add r12d,r15d + mov r14d,ecx + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[56+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[rsp] + mov edi,DWORD[52+rsp] + + mov r12d,r13d + ror r13d,11 + add ebx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[32+rsp] + + add r12d,DWORD[60+rsp] + mov r13d,r9d + 
add r12d,edi + mov r14d,ebx + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[60+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + cmp BYTE[3+rbp],0 + jnz NEAR $L$rounds_16_xx + + mov rdi,QWORD[((64+0))+rsp] + add eax,r14d + lea rsi,[64+rsi] + + add eax,DWORD[rdi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop + + mov rsi,QWORD[88+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_nohw: +section .rdata rdata align=8 +ALIGN 64 + +K256: + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 + DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 + DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 + DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 + DB 111,114,103,62,0 +section .text + 
+global sha256_block_data_order_hw + +ALIGN 64 +sha256_block_data_order_hw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_hw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + lea rsp,[((-88))+rsp] + movaps XMMWORD[(-8-80)+rax],xmm6 + movaps XMMWORD[(-8-64)+rax],xmm7 + movaps XMMWORD[(-8-48)+rax],xmm8 + movaps XMMWORD[(-8-32)+rax],xmm9 + movaps XMMWORD[(-8-16)+rax],xmm10 +$L$prologue_shaext: + lea rcx,[((K256+128))] + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm7,XMMWORD[((512-128))+rcx] + + pshufd xmm0,xmm1,0x1b + pshufd xmm1,xmm1,0xb1 + pshufd xmm2,xmm2,0x1b + movdqa xmm8,xmm7 +DB 102,15,58,15,202,8 + punpcklqdq xmm2,xmm0 + jmp NEAR $L$oop_shaext + +ALIGN 16 +$L$oop_shaext: + movdqu xmm3,XMMWORD[rsi] + movdqu xmm4,XMMWORD[16+rsi] + movdqu xmm5,XMMWORD[32+rsi] +DB 102,15,56,0,223 + movdqu xmm6,XMMWORD[48+rsi] + + movdqa xmm0,XMMWORD[((0-128))+rcx] + paddd xmm0,xmm3 +DB 102,15,56,0,231 + movdqa xmm10,xmm2 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + nop + movdqa xmm9,xmm1 + DB 15,56,203,202 + + movdqa xmm0,XMMWORD[((32-128))+rcx] + paddd xmm0,xmm4 +DB 102,15,56,0,239 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + lea rsi,[64+rsi] + DB 15,56,204,220 + DB 15,56,203,202 + + movdqa xmm0,XMMWORD[((64-128))+rcx] + paddd xmm0,xmm5 +DB 102,15,56,0,247 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 + DB 15,56,204,229 + DB 15,56,203,202 + + movdqa xmm0,XMMWORD[((96-128))+rcx] + paddd xmm0,xmm6 + DB 15,56,205,222 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 + DB 15,56,204,238 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((128-128))+rcx] + paddd xmm0,xmm3 + DB 15,56,205,227 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 + DB 15,56,204,243 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((160-128))+rcx] + paddd xmm0,xmm4 + DB 15,56,205,236 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 + DB 15,56,204,220 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((192-128))+rcx] + paddd xmm0,xmm5 + DB 15,56,205,245 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 + DB 15,56,204,229 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((224-128))+rcx] + paddd xmm0,xmm6 + DB 15,56,205,222 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 + DB 15,56,204,238 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((256-128))+rcx] + paddd xmm0,xmm3 + DB 15,56,205,227 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 + DB 15,56,204,243 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((288-128))+rcx] + paddd xmm0,xmm4 + DB 15,56,205,236 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 + DB 15,56,204,220 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((320-128))+rcx] + paddd xmm0,xmm5 + DB 15,56,205,245 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 + DB 15,56,204,229 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((352-128))+rcx] + paddd xmm0,xmm6 + DB 15,56,205,222 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 + DB 15,56,204,238 + DB 15,56,203,202 + movdqa 
xmm0,XMMWORD[((384-128))+rcx] + paddd xmm0,xmm3 + DB 15,56,205,227 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 + DB 15,56,204,243 + DB 15,56,203,202 + movdqa xmm0,XMMWORD[((416-128))+rcx] + paddd xmm0,xmm4 + DB 15,56,205,236 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + DB 15,56,203,202 + paddd xmm6,xmm7 + + movdqa xmm0,XMMWORD[((448-128))+rcx] + paddd xmm0,xmm5 + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + DB 15,56,205,245 + movdqa xmm7,xmm8 + DB 15,56,203,202 + + movdqa xmm0,XMMWORD[((480-128))+rcx] + paddd xmm0,xmm6 + nop + DB 15,56,203,209 + pshufd xmm0,xmm0,0x0e + dec rdx + nop + DB 15,56,203,202 + + paddd xmm2,xmm10 + paddd xmm1,xmm9 + jnz NEAR $L$oop_shaext + + pshufd xmm2,xmm2,0xb1 + pshufd xmm7,xmm1,0x1b + pshufd xmm1,xmm1,0xb1 + punpckhqdq xmm1,xmm2 +DB 102,15,58,15,215,8 + + movdqu XMMWORD[rdi],xmm1 + movdqu XMMWORD[16+rdi],xmm2 + movaps xmm6,XMMWORD[((-8-80))+rax] + movaps xmm7,XMMWORD[((-8-64))+rax] + movaps xmm8,XMMWORD[((-8-48))+rax] + movaps xmm9,XMMWORD[((-8-32))+rax] + movaps xmm10,XMMWORD[((-8-16))+rax] + mov rsp,rax +$L$epilogue_shaext: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_hw: +global sha256_block_data_order_ssse3 + +ALIGN 64 +sha256_block_data_order_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_ssse3: + + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + + + jmp NEAR $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3: + movdqa xmm7,XMMWORD[((K256+512))] + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] +DB 102,15,56,0,199 + movdqu xmm3,XMMWORD[48+rsi] + lea rbp,[K256] +DB 102,15,56,0,207 + movdqa xmm4,XMMWORD[rbp] + movdqa xmm5,XMMWORD[32+rbp] +DB 102,15,56,0,215 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD[64+rbp] +DB 102,15,56,0,223 + movdqa xmm7,XMMWORD[96+rbp] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47: + sub rbp,-128 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,224,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,250,4 + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov 
r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD[rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,225,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,251,4 + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + 
xor r13d,r9d + add eax,DWORD[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD[32+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,226,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,248,4 + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD[64+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,227,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,249,4 + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD[52+rsp] + mov 
edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD[96+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror 
r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and 
r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_ssse3 + + mov rsi,QWORD[88+rsp] + + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_ssse3: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_ssse3: +global sha256_block_data_order_avx + +ALIGN 64 +sha256_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_avx: + + vzeroupper + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + vmovdqa xmm8,XMMWORD[((K256+512+32))] + vmovdqa xmm9,XMMWORD[((K256+512+64))] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm7,XMMWORD[((K256+512))] + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm0,xmm0,xmm7 + lea rbp,[K256] + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,XMMWORD[rbp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,XMMWORD[32+rbp] + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + vpaddd xmm7,xmm3,XMMWORD[96+rbp] + vmovdqa XMMWORD[rsp],xmm4 + mov r14d,eax + vmovdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + vmovdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + vmovdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + sub rbp,-128 + vpalignr xmm4,xmm1,xmm0,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm3,xmm2,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor 
r14d,eax + and r12d,r8d + vpaddd xmm0,xmm0,xmm7 + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm3,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm0,xmm0,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm0,xmm0,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + vpshufd xmm7,xmm0,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm0,xmm0,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm0,XMMWORD[rbp] + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[rsp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm0,xmm3,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm1,xmm1,xmm7 + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm0,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm1,xmm1,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor 
xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm1,xmm1,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + vpshufd xmm7,xmm1,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm1,xmm1,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm1,XMMWORD[32+rbp] + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[16+rsp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm1,xmm0,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm2,xmm2,xmm7 + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm1,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm2,xmm2,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm2,xmm2,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + vpshufd xmm7,xmm2,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm2,xmm2,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + 
vmovdqa XMMWORD[32+rsp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm2,xmm1,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm3,xmm3,xmm7 + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm2,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm3,xmm3,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm3,xmm3,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + vpshufd xmm7,xmm3,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm3,xmm3,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm3,XMMWORD[96+rbp] + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add 
r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor 
r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_avx + + mov rsi,QWORD[88+rsp] + + vzeroupper + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + 
lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((64+24))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea rsi,[((64+32))+rsi] + lea rdi,[512+r8] + mov ecx,8 + DD 0xa548f3fc + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +ALIGN 16 +shaext_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + lea r10,[$L$prologue_shaext] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea r10,[$L$epilogue_shaext] + cmp rbx,r10 + jae NEAR $L$in_prologue + + lea rsi,[((-8-80))+rax] + lea rdi,[512+r8] + mov ecx,10 + DD 0xa548f3fc + + jmp NEAR $L$in_prologue + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_sha256_block_data_order_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_sha256_block_data_order_hw: + DB 9,0,0,0 + DD shaext_handler wrt ..imagebase +$L$SEH_info_sha256_block_data_order_ssse3: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha256_block_data_order_avx: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/sha256-x86_64-nasm.o b/ring-0.17.14/pregenerated/sha256-x86_64-nasm.o new file mode 100644 index 0000000000..fcd5723481 Binary files /dev/null and b/ring-0.17.14/pregenerated/sha256-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/sha512-armv4-linux32.S b/ring-0.17.14/pregenerated/sha512-armv4-linux32.S new 
file mode 100644 index 0000000000..9150b0f405 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-armv4-linux32.S @@ -0,0 +1,1857 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ ==================================================================== + +@ SHA512 block procedure for ARMv4. September 2007. + +@ This code is ~4.5 (four and a half) times faster than code generated +@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +@ Xscale PXA250 core]. +@ +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 6% improvement on +@ Cortex A8 core and ~40 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 7% +@ improvement on Coxtex A8 core and ~38 cycles per byte. + +@ March 2011. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process +@ one byte in 23.3 cycles or ~60% faster than integer-only code. + +@ August 2012. +@ +@ Improve NEON performance by 12% on Snapdragon S4. In absolute +@ terms it's 22.6 cycles per byte, which is disappointing result. +@ Technical writers asserted that 3-way S4 pipeline can sustain +@ multiple NEON instructions per cycle, but dual NEON issue could +@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +@ for further details. On side note Cortex-A15 processes one byte in +@ 16 cycles. + +@ Byte order [in]dependence. ========================================= +@ +@ Originally caller was expected to maintain specific *dword* order in +@ h[0-7], namely with most significant dword at *lower* address, which +@ was reflected in below two parameters as 0 and 4. Now caller is +@ expected to maintain native byte order for whole 64-bit values. +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
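The integer-only path in this file never touches 64-bit registers: every 64-bit rotation in the SHA-512 round function is rebuilt from 32-bit shifts and XORs over the value's low and high halves, exactly as the "LO/HI" comments inside sha512_block_data_order_nohw spell out for Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41). Below is a minimal Rust sketch of that decomposition (the helper names and the test value are illustrative assumptions, not taken from the vendored source); it checks the half-word formula against a plain 64-bit rotate.

    // Illustrative sketch of the 32-bit decomposition used by the ARMv4
    // integer-only SHA-512 code; names and test value are hypothetical.

    fn sigma1_64(x: u64) -> u64 {
        // FIPS 180-4: Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41)
        x.rotate_right(14) ^ x.rotate_right(18) ^ x.rotate_right(41)
    }

    // Same value computed from 32-bit halves, the way the assembly does it:
    //   ROTR64(x,n) for 0<n<32:  lo' = lo>>n | hi<<(32-n),  hi' = hi>>n | lo<<(32-n)
    //   ROTR64(x,41) = rotate by 32 (swap halves), then rotate by 9.
    fn sigma1_halves(lo: u32, hi: u32) -> (u32, u32) {
        let rotr = |lo: u32, hi: u32, n: u32| {
            (lo >> n | hi << (32 - n), hi >> n | lo << (32 - n))
        };
        let (a_lo, a_hi) = rotr(lo, hi, 14);
        let (b_lo, b_hi) = rotr(lo, hi, 18);
        let (c_lo, c_hi) = rotr(hi, lo, 9); // halves swapped = rotate by 32
        (a_lo ^ b_lo ^ c_lo, a_hi ^ b_hi ^ c_hi)
    }

    fn main() {
        let x: u64 = 0x0123_4567_89ab_cdef; // arbitrary test value
        let (lo, hi) = (x as u32, (x >> 32) as u32);
        let (s_lo, s_hi) = sigma1_halves(lo, hi);
        assert_eq!(sigma1_64(x), ((s_hi as u64) << 32) | s_lo as u64);
    }

The low-half result (lo>>14 ^ hi<<18) ^ (lo>>18 ^ hi<<14) ^ (hi>>9 ^ lo<<23) matches the "LO" comment in the round code below, and the high half matches the "HI" comment, which is why the generated code can run on cores without 64-bit integer support.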
+.arch armv7-a + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + +.type K512,%object +.align 5 +K512: + WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) + WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) + WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) + WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) + WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) + WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) + WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) + WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) + WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) + WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) + WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) + WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) + WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) + WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) + WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) + WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) + WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) + WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) + WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) + WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) + WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) + WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) + WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) + WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) + WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) + WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) + WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) + WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) + WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) + WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) + WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) + WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) + WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) + WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) + WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) + WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) + WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) + WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) + WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) + WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +sha512_block_data_order_nohw: + add r2,r1,r2,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + adr r14,K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +.Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +.L00_15: +#if __ARM_ARCH<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb 
r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq .L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 
+ adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 +#if __ARM_ARCH>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq .L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr r11, [r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc 
r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl sha512_block_data_order_neon +.hidden sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: + dmb @ errata #451034 on early Cortex A8 + add r2,r1,r2,lsl#7 @ len to point at the end of inp + adr r3,K512 + VFP_ABI_PUSH + vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context +.Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 0>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 1>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 2>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 3>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 4>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 5>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 6>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 7>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 8>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 9>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 10>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 11>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 12>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 13>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 14>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 15>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + mov r12,#4 +.L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 17>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 19>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 21>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 23>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 25>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 27>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 29>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 31>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + bne .L16_79_neon + + vadd.i64 d16,d30 @ h+=Maj from the past + vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + bx lr @ .word 0xe12fff1e +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/sha512-armv8-ios64.S b/ring-0.17.14/pregenerated/sha512-armv8-ios64.S new file mode 100644 index 0000000000..06745ec50d --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-armv8-ios64.S @@ -0,0 +1,1598 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. 
+// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +#endif + +.text + +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw + +.align 6 +_sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512@PAGE + add x30,x30,LK512@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 
// Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // 
Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add 
x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 
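
For readers tracing the annotated rounds above (the Ch, Maj, Sigma0/Sigma1, sigma0/sigma1 comments and the running "h+=..." notes), the scalar logic is the standard FIPS 180-4 SHA-512 round; the ror/lsr immediates in the assembly correspond to the rotation and shift counts below. The Rust sketch that follows is a plain reference model for orientation only; the function names are illustrative and are not part of ring's API or of this generated code.

// Reference formulas for the round functions named in the comments above,
// per FIPS 180-4. Rotation counts match the assembly's ror/lsr immediates.
fn ch(e: u64, f: u64, g: u64) -> u64 { (e & f) ^ (!e & g) }
fn maj(a: u64, b: u64, c: u64) -> u64 { (a & b) ^ (a & c) ^ (b & c) }
fn big_sigma0(a: u64) -> u64 { a.rotate_right(28) ^ a.rotate_right(34) ^ a.rotate_right(39) }
fn big_sigma1(e: u64) -> u64 { e.rotate_right(14) ^ e.rotate_right(18) ^ e.rotate_right(41) }
fn small_sigma0(x: u64) -> u64 { x.rotate_right(1) ^ x.rotate_right(8) ^ (x >> 7) }  // sigma0(X[i+1])
fn small_sigma1(x: u64) -> u64 { x.rotate_right(19) ^ x.rotate_right(61) ^ (x >> 6) } // sigma1(X[i+14])

// One round over the working variables [a,b,c,d,e,f,g,h], mirroring the
// "h+=K[i]", "h+=X[i]", "h+=Ch(e,f,g)", "h+=Sigma1(e)", "d+=h",
// "h+=Maj(a,b,c)", "h+=Sigma0(a)" steps annotated in the assembly.
fn round(w: &mut [u64; 8], k: u64, x: u64) {
    let [a, b, c, d, e, f, g, h] = *w;
    let t1 = h
        .wrapping_add(big_sigma1(e))
        .wrapping_add(ch(e, f, g))
        .wrapping_add(k)
        .wrapping_add(x);
    let t2 = big_sigma0(a).wrapping_add(maj(a, b, c));
    *w = [t1.wrapping_add(t2), a, b, c, d.wrapping_add(t1), e, f, g];
}

The assembly avoids the variable rotation by renaming registers each round, but the arithmetic is the same; the message-schedule lines marked sigma0(X[i+1]) and sigma1(X[i+14]) correspond to small_sigma0 and small_sigma1 above.
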
+ eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor 
x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 
// h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add 
x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section __TEXT,__const +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 
0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl _sha512_block_data_order_hw +.private_extern _sha512_block_data_order_hw + +.align 6 +_sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,LK512@PAGE + add x3,x3,LK512@PAGEOFF + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b Loop_hw + +.align 4 +Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext 
v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add 
v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 
v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext 
v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h 
v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/sha512-armv8-linux64.S b/ring-0.17.14/pregenerated/sha512-armv8-linux64.S new file mode 100644 index 0000000000..c1aa3797a6 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-armv8-linux64.S @@ -0,0 +1,1598 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
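
The ELF (linux64) file introduced below appears to track the Mach-O (ios64) variant above instruction for instruction; the visible differences are confined to symbol decoration (.globl/.hidden/.type with adrp/:lo12: addressing for ELF, versus .private_extern with LK512@PAGE/@PAGEOFF for Mach-O). Judging from the prologues, x0 holds the eight-word hash state, x1 the input pointer, and x2 the block count (cf. "add x2,x1,x2,lsl#7 // end of input", i.e. 128-byte blocks), so the entry points share one C-style signature. The sketch below shows how such an entry point could be declared and called from Rust; the declaration is inferred from the assembly's register use, and the wrapper is hypothetical, not ring's actual binding or dispatch code.

// Hypothetical FFI declaration inferred from the assembly's argument use;
// ring's real wrapper may differ in name, visibility, and CPU dispatch.
extern "C" {
    // state: 8 x u64 hash words (a..h); data: message bytes; num: count of 128-byte blocks
    fn sha512_block_data_order_nohw(state: *mut u64, data: *const u8, num: usize);
}

// Illustrative safe wrapper (assumption, not ring API): feed whole 128-byte blocks only.
fn sha512_compress(state: &mut [u64; 8], blocks: &[u8]) {
    assert_eq!(blocks.len() % 128, 0, "input must be whole 128-byte blocks");
    let num = blocks.len() / 128;
    if num > 0 {
        unsafe { sha512_block_data_order_nohw(state.as_mut_ptr(), blocks.as_ptr(), num) };
    }
}
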
+ +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +#endif + +.text + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +.align 6 +sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,.LK512 + add x30,x30,:lo12:.LK512 + stp x0,x2,[x29,#96] + +.Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + 
eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor 
x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // 
Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +.Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add 
x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + 
and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // 
Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // 
(b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw + +.section .rodata +.align 6 +.type .LK512,%object +.LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 
0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator +.size .LK512,.-.LK512 +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw +.hidden sha512_block_data_order_hw +.type sha512_block_data_order_hw,%function +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,.LK512 + add x3,x3,:lo12:.LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b .Loop_hw + +.align 4 +.Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext 
v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add 
v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 
v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext 
v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 
{v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,.Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret +.size sha512_block_data_order_hw,.-sha512_block_data_order_hw +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/sha512-armv8-win64.S b/ring-0.17.14/pregenerated/sha512-armv8-win64.S new file mode 100644 index 0000000000..340b2b0ff5 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-armv8-win64.S @@ -0,0 +1,1602 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. 
+ +#ifndef __KERNEL__ +#endif + +.text + +.globl sha512_block_data_order_nohw + +.def sha512_block_data_order_nohw + .type 32 +.endef +.align 6 +sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512 + add x30,x30,:lo12:LK512 + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add 
x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor 
x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // 
h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // 
Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // 
sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add 
x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr 
x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 
83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw + +.def sha512_block_data_order_hw + .type 32 +.endef +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,LK512 + add x3,x3,:lo12:LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b Loop_hw + +.align 4 +Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + 
ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" 
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a3 //sha512h 
v3.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + 
T1" +.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/sha512-x86_64-elf.S b/ring-0.17.14/pregenerated/sha512-x86_64-elf.S new file mode 100644 index 0000000000..7e7727757d --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-x86_64-elf.S @@ -0,0 +1,2978 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,@function +.align 16 +sha512_block_data_order_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue: + + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop + +.align 16 +.Lloop: + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq 
%rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq 
%rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq 
%rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + 
rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + 
movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq 
$6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz .Lrounds_16_xx + + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +.section .rodata +.align 64 +.type K512,@object +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 
0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl sha512_block_data_order_avx +.hidden sha512_block_data_order_avx +.type sha512_block_data_order_avx,@function +.align 64 +sha512_block_data_order_avx: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx 
+.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq 
%xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor 
%xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + 
shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 
120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + 
xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq 
$5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size sha512_block_data_order_avx,.-sha512_block_data_order_avx +#endif diff --git a/ring-0.17.14/pregenerated/sha512-x86_64-macosx.S b/ring-0.17.14/pregenerated/sha512-x86_64-macosx.S new file mode 100644 index 0000000000..f882ce0193 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-x86_64-macosx.S @@ -0,0 +1,2978 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw + +.p2align 4 +_sha512_block_data_order_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue: + + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop + +.p2align 4 +L$loop: + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + 
leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq 
%r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq 
%r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + 
movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + 
rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq 
%r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz L$rounds_16_xx + + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop + + movq 152(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue: + ret + + +.section __DATA,__const +.p2align 6 + +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _sha512_block_data_order_avx +.private_extern _sha512_block_data_order_avx + +.p2align 6 +_sha512_block_data_order_avx: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq 
K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + 
shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq 
$6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq 
%r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$avx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq 
$6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq 
%r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq 
%rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_avx + + movq 152(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx: + ret + + +#endif diff --git a/ring-0.17.14/pregenerated/sha512-x86_64-nasm.asm b/ring-0.17.14/pregenerated/sha512-x86_64-nasm.asm new file mode 100644 index 0000000000..96e1ad9f45 --- /dev/null +++ b/ring-0.17.14/pregenerated/sha512-x86_64-nasm.asm @@ -0,0 +1,3138 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +global sha512_block_data_order_nohw + +ALIGN 16 +sha512_block_data_order_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,16*8+4*8 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[152+rsp],rax + +$L$prologue: + + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov rdi,rbx + lea rbp,[K512] + xor rdi,rcx + mov r12,QWORD[rsi] + mov r13,r8 + mov r14,rax + bswap r12 + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + add r11,r14 + mov r12,QWORD[8+rsi] + mov r13,rdx + mov r14,r11 + bswap r12 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[8+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + add r10,r14 + mov r12,QWORD[16+rsi] + mov r13,rcx + mov r14,r10 + bswap r12 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[16+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + 
and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + add r9,r14 + mov r12,QWORD[24+rsi] + mov r13,rbx + mov r14,r9 + bswap r12 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[24+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + add r8,r14 + mov r12,QWORD[32+rsi] + mov r13,rax + mov r14,r8 + bswap r12 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[32+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + add rdx,r14 + mov r12,QWORD[40+rsi] + mov r13,r11 + mov r14,rdx + bswap r12 + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[40+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + add rcx,r14 + mov r12,QWORD[48+rsi] + mov r13,r10 + mov r14,rcx + bswap r12 + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[48+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + add rbx,r14 + mov r12,QWORD[56+rsi] + mov r13,r9 + mov r14,rbx + bswap r12 + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[56+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + add rax,r14 + mov r12,QWORD[64+rsi] + mov r13,r8 + mov r14,rax + bswap r12 + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[64+rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + add r11,r14 + mov r12,QWORD[72+rsi] + mov r13,rdx + mov r14,r11 + bswap r12 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[72+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + add r10,r14 + mov r12,QWORD[80+rsi] + mov r13,rcx + mov r14,r10 + bswap r12 + 
ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[80+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + add r9,r14 + mov r12,QWORD[88+rsi] + mov r13,rbx + mov r14,r9 + bswap r12 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[88+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + add r8,r14 + mov r12,QWORD[96+rsi] + mov r13,rax + mov r14,r8 + bswap r12 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[96+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + add rdx,r14 + mov r12,QWORD[104+rsi] + mov r13,r11 + mov r14,rdx + bswap r12 + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[104+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + add rcx,r14 + mov r12,QWORD[112+rsi] + mov r13,r10 + mov r14,rcx + bswap r12 + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[112+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + add rbx,r14 + mov r12,QWORD[120+rsi] + mov r13,r9 + mov r14,rbx + bswap r12 + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[120+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + jmp NEAR $L$rounds_16_xx +ALIGN 16 +$L$rounds_16_xx: + mov r13,QWORD[8+rsp] + mov r15,QWORD[112+rsp] + + mov r12,r13 + ror r13,7 + add rax,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[72+rsp] + + add r12,QWORD[rsp] + mov r13,r8 + add r12,r15 + mov r14,rax + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + 
add r11,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[16+rsp] + mov rdi,QWORD[120+rsp] + + mov r12,r13 + ror r13,7 + add r11,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[80+rsp] + + add r12,QWORD[8+rsp] + mov r13,rdx + add r12,rdi + mov r14,r11 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[8+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[24+rsp] + mov r15,QWORD[rsp] + + mov r12,r13 + ror r13,7 + add r10,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[88+rsp] + + add r12,QWORD[16+rsp] + mov r13,rcx + add r12,r15 + mov r14,r10 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[16+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[32+rsp] + mov rdi,QWORD[8+rsp] + + mov r12,r13 + ror r13,7 + add r9,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[96+rsp] + + add r12,QWORD[24+rsp] + mov r13,rbx + add r12,rdi + mov r14,r9 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[24+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[40+rsp] + mov r15,QWORD[16+rsp] + + mov r12,r13 + ror r13,7 + add r8,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[104+rsp] + + add r12,QWORD[32+rsp] + mov r13,rax + add r12,r15 + mov r14,r8 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[32+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[48+rsp] + mov rdi,QWORD[24+rsp] + + mov r12,r13 + ror r13,7 + add rdx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[112+rsp] + + add r12,QWORD[40+rsp] + mov r13,r11 + add r12,rdi + mov r14,rdx + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[40+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror 
r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[56+rsp] + mov r15,QWORD[32+rsp] + + mov r12,r13 + ror r13,7 + add rcx,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[120+rsp] + + add r12,QWORD[48+rsp] + mov r13,r10 + add r12,r15 + mov r14,rcx + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[48+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[64+rsp] + mov rdi,QWORD[40+rsp] + + mov r12,r13 + ror r13,7 + add rbx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[rsp] + + add r12,QWORD[56+rsp] + mov r13,r9 + add r12,rdi + mov r14,rbx + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[56+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[72+rsp] + mov r15,QWORD[48+rsp] + + mov r12,r13 + ror r13,7 + add rax,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[8+rsp] + + add r12,QWORD[64+rsp] + mov r13,r8 + add r12,r15 + mov r14,rax + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[64+rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[80+rsp] + mov rdi,QWORD[56+rsp] + + mov r12,r13 + ror r13,7 + add r11,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[16+rsp] + + add r12,QWORD[72+rsp] + mov r13,rdx + add r12,rdi + mov r14,r11 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[72+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[88+rsp] + mov r15,QWORD[64+rsp] + + mov r12,r13 + ror r13,7 + add r10,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[24+rsp] + + add r12,QWORD[80+rsp] + mov r13,rcx + add r12,r15 + mov r14,r10 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[80+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + 
xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[96+rsp] + mov rdi,QWORD[72+rsp] + + mov r12,r13 + ror r13,7 + add r9,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[32+rsp] + + add r12,QWORD[88+rsp] + mov r13,rbx + add r12,rdi + mov r14,r9 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[88+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[104+rsp] + mov r15,QWORD[80+rsp] + + mov r12,r13 + ror r13,7 + add r8,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[40+rsp] + + add r12,QWORD[96+rsp] + mov r13,rax + add r12,r15 + mov r14,r8 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[96+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[112+rsp] + mov rdi,QWORD[88+rsp] + + mov r12,r13 + ror r13,7 + add rdx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[48+rsp] + + add r12,QWORD[104+rsp] + mov r13,r11 + add r12,rdi + mov r14,rdx + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[104+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[120+rsp] + mov r15,QWORD[96+rsp] + + mov r12,r13 + ror r13,7 + add rcx,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[56+rsp] + + add r12,QWORD[112+rsp] + mov r13,r10 + add r12,r15 + mov r14,rcx + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[112+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[rsp] + mov rdi,QWORD[104+rsp] + + mov r12,r13 + ror r13,7 + add rbx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[64+rsp] + + add r12,QWORD[120+rsp] + mov r13,r9 + add r12,rdi + mov r14,rbx + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[120+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add 
r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + cmp BYTE[7+rbp],0 + jnz NEAR $L$rounds_16_xx + + mov rdi,QWORD[((128+0))+rsp] + add rax,r14 + lea rsi,[128+rsi] + + add rax,QWORD[rdi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop + + mov rsi,QWORD[152+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha512_block_data_order_nohw: +section .rdata rdata align=8 +ALIGN 64 + +K512: + DQ 0x428a2f98d728ae22,0x7137449123ef65cd + DQ 0x428a2f98d728ae22,0x7137449123ef65cd + DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + DQ 0x3956c25bf348b538,0x59f111f1b605d019 + DQ 0x3956c25bf348b538,0x59f111f1b605d019 + DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + DQ 0xd807aa98a3030242,0x12835b0145706fbe + DQ 0xd807aa98a3030242,0x12835b0145706fbe + DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + DQ 0x9bdc06a725c71235,0xc19bf174cf692694 + DQ 0x9bdc06a725c71235,0xc19bf174cf692694 + DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + DQ 0x983e5152ee66dfab,0xa831c66d2db43210 + DQ 0x983e5152ee66dfab,0xa831c66d2db43210 + DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 + DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 + DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 + DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 + DQ 0x06ca6351e003826f,0x142929670a0e6e70 + DQ 0x06ca6351e003826f,0x142929670a0e6e70 + DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 + DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 + DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 + DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 + DQ 0x81c2c92e47edaee6,0x92722c851482353b + DQ 0x81c2c92e47edaee6,0x92722c851482353b + DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 + DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 + DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 + DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 + DQ 0xd192e819d6ef5218,0xd69906245565a910 + DQ 0xd192e819d6ef5218,0xd69906245565a910 + DQ 0xf40e35855771202a,0x106aa07032bbd1b8 + DQ 0xf40e35855771202a,0x106aa07032bbd1b8 + DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + DQ 
0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 + DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 + DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec + DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec + DQ 0x90befffa23631e28,0xa4506cebde82bde9 + DQ 0x90befffa23631e28,0xa4506cebde82bde9 + DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b + DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b + DQ 0xca273eceea26619c,0xd186b8c721c0c207 + DQ 0xca273eceea26619c,0xd186b8c721c0c207 + DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 + DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 + DQ 0x113f9804bef90dae,0x1b710b35131c471b + DQ 0x113f9804bef90dae,0x1b710b35131c471b + DQ 0x28db77f523047d84,0x32caab7b40c72493 + DQ 0x28db77f523047d84,0x32caab7b40c72493 + DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + + DQ 0x0001020304050607,0x08090a0b0c0d0e0f + DQ 0x0001020304050607,0x08090a0b0c0d0e0f + DB 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 + DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 + DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 + DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 + DB 111,114,103,62,0 +section .text + +global sha512_block_data_order_avx + +ALIGN 64 +sha512_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[152+rsp],rax + + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_avx: + + vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa 
XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm0,xmm0,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm7,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm7,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm0,xmm0,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm7,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[8+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm0,xmm0,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm1,xmm1,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[16+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm0,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm0,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm1,xmm1,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm0,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[24+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm1,xmm1,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm2,xmm2,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[32+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm1,6 + add r11,rdx + add rdx,rdi + vpxor 
xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm1,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm2,xmm2,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm1,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[40+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm2,xmm2,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm3,xmm3,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[48+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm2,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm2,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq xmm3,xmm3,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm2,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[56+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm3,xmm3,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm4,xmm4,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[64+rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm3,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm3,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm4,xmm4,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm3,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[72+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm4,xmm4,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 
+ vpalignr xmm8,xmm6,xmm5,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm5,xmm5,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[80+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm4,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm4,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm5,xmm5,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm4,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[88+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm5,xmm5,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm6,xmm6,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[96+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm5,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm5,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm6,xmm6,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm5,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[104+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm6,xmm6,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm7,xmm7,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[112+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm6,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm6,3 + shrd r13,r13,23 + mov rbx,r14 + 
vpaddq xmm7,xmm7,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm6,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[120+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm7,xmm7,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + 
xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add 
r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_avx + + mov rsi,QWORD[152+rsp] + + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha512_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((128+24))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea rsi,[((128+32))+rsi] + lea rdi,[512+r8] + mov ecx,12 + DD 0xa548f3fc + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_sha512_block_data_order_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_sha512_block_data_order_avx: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/sha512-x86_64-nasm.o b/ring-0.17.14/pregenerated/sha512-x86_64-nasm.o new file mode 100644 index 0000000000..94e2ffe0f2 Binary files /dev/null and 
b/ring-0.17.14/pregenerated/sha512-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/vpaes-armv7-linux32.S b/ring-0.17.14/pregenerated/vpaes-armv7-linux32.S new file mode 100644 index 0000000000..bdf08a2741 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-armv7-linux32.S @@ -0,0 +1,722 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +.syntax unified + +.arch armv7-a +.fpu neon + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + +.type _vpaes_consts,%object +.align 7 @ totally strategic alignment +_vpaes_consts: +.Lk_mc_forward:@ mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:@ mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr:@ sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +.Lk_inv:@ inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt:@ input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo:@ sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1:@ sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2:@ sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +.type _vpaes_preheat,%function +.align 4 +_vpaes_preheat: + adr r10, .Lk_inv + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10]! @ .Lk_inv + add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo + vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 + vld1.64 {q14,q15}, [r10] @ .Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [r2] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + adr r11, .Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, .Lk_mc_forward+16 + vld1.64 {q5}, [r9]! 
@ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 d3, {q2}, d3 + vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 d5, {q3}, d1 + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. + tst r8, r8 + b .Lenc_entry + +.align 4 +.Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 d9, {q13}, d5 + vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 d1, {q12}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 d11, {q15}, d5 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 d5, {q14}, d7 + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 d7, {q0}, d3 + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 d11, {q0}, d9 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 d9, {q3}, d3 + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +.Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 d11, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne .Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, .Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. 
+ vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 d5, {q0}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 d1, {q2}, d3 + bx lr +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ +.type _vpaes_key_consts,%object +.align 4 +_vpaes_key_consts: +.Lk_rcon:@ rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt:@ output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew:@ deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 +.size _vpaes_key_consts,.-_vpaes_key_consts + +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr r11, .Lk_rcon + vmov.i8 q12, #0x5b @ .Lk_s63 + adr r10, .Lk_inv @ Must be aligned to 8 mod 16. + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10] @ .Lk_inv + vld1.64 {q8}, [r11] @ .Lk_rcon + bx lr +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. + stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, .Lk_sr @ Must be aligned to 8 mod 16. + vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) + + @ *ring*: Decryption removed. + +.Lschedule_go: + cmp r1, #192 @ cmp $192, %esi + bhi .Lschedule_256 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. +@@ +.Lschedule_128: + mov r0, #10 @ mov $10, %esi + +.Loop_schedule_128: + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b .Loop_schedule_128 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. 
+@@ +.align 4 +.Lschedule_256: + vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov r0, #7 @ mov $7, %esi + +.Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. swap xmm7 and xmm6 + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +.Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform + add r2, r2, #32 @ add $32, %rdx + vmov q2, q0 + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 d1, {q2}, d3 + +.Lschedule_mangle_last_dec: + sub r2, r2, #16 @ add $-16, %rdx + veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. +@@ +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 + vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. + @ We pin other values in _vpaes_key_preheat, so load them now. 
+ adr r11, .Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 d7, {q10}, d7 + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 d5, {q10}, d9 + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 d9, {q15}, d7 + vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 d3, {q14}, d5 + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d3 + vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q15}, d1 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. +@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + tst r3, r3 + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. 
+ veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add r2, r2, #16 @ add $16, %rdx + vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 d9, {q2}, d11 + vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 d3, {q4}, d11 + vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 d7, {q1}, d11 + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. + vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d5, {q3}, d3 + add r8, r8, #64-16 @ add $-16, %r8 + and r8, r8, #~(1<<6) @ and $0x30, %r8 + vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) + bx lr +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov r3, #0 @ mov $0,%ecx + mov r8, #0x30 @ mov $0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +@ Additional constants for converting to bsaes. +.type _vpaes_convert_consts,%object +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) +.Lk_opt_then_skew: +.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b +.quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl vpaes_encrypt_key_to_bsaes +.hidden vpaes_encrypt_key_to_bsaes +.type vpaes_encrypt_key_to_bsaes,%function +.align 4 +vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. 
The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {q12}, [r2] + vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64 + adr r11, .Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +.Loop_enc_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + vmov q0, q1 + vtbl.8 d4, {q1}, d24 + vtbl.8 d5, {q1}, d25 + veor q0, q0, q2 + vtbl.8 d2, {q2}, d24 + vtbl.8 d3, {q2}, d25 + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, q10 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + b .Loop_enc_key_to_bsaes + +.Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ q11, not q10. + veor q0, q0, q11 + vrev32.8 q0, q0 + vst1.64 {q0}, [r0] + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7,r8,r9,r10,r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + cmp r2, #0 + @ r8 is passed on the stack. + ldr r8, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. + mov r9, r3 + mov r3, r2 + mov r2, r9 + + @ Load the IV and counter portion. + ldr r7, [r8, #12] + vld1.8 {q7}, [r8] + + bl _vpaes_preheat + rev r7, r7 @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [r0]! @ .Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [r1]! + subs r3, r3, #1 + @ Update the counter. 
+ add r7, r7, #1 + rev r9, r7 + vmov.32 d15[1], r9 + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/vpaes-armv8-ios64.S b/ring-0.17.14/pregenerated/vpaes-armv8-ios64.S new file mode 100644 index 0000000000..b4e29dcc17 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-armv8-ios64.S @@ -0,0 +1,744 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. 
+## + +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## + +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + + +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor 
v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## + +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v16.16b, #0x5b // Lk_s63 + adrp x11, Lk_sb1@PAGE + add x11, x11, Lk_sb1@PAGEOFF + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + adrp x10, Lk_dksd@PAGE + add x10, x10, Lk_dksd@PAGEOFF + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward@PAGE + add x11, x11, Lk_mc_forward@PAGEOFF + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + + +.align 4 +_vpaes_schedule_core: + 
AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10 + add x10, x10, Lk_sr@PAGEOFF + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, Lk_deskew@PAGEOFF + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, Lk_opt@PAGEOFF + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## + +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## + +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## + +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## + +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key + +.align 4 +_vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.align 4 +_vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. 
+ add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/ring-0.17.14/pregenerated/vpaes-armv8-linux64.S b/ring-0.17.14/pregenerated/vpaes-armv8-linux64.S new file mode 100644 index 0000000000..c3c60f605f --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-armv8-linux64.S @@ -0,0 +1,744 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and 
%xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, 
v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, .Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, .Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 + adrp x11, .Lk_mc_forward + add x11, x11, :lo12:.Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, 
.Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + + cmp w1, #192 // cmp $192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov x0, #10 // mov $10, %esi + +.Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +.Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +.Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +.Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz w3, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, .Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // .Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. 
+ add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/vpaes-armv8-win64.S b/ring-0.17.14/pregenerated/vpaes-armv8-win64.S new file mode 100644 index 0000000000..6a524370c0 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-armv8-win64.S @@ -0,0 +1,766 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified 
below. +## +.def _vpaes_encrypt_preheat + .type 32 +.endef +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.def _vpaes_encrypt_core + .type 32 +.endef +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + +.def _vpaes_encrypt_2x + .type 32 +.endef +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor 
%xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.def _vpaes_key_preheat + .type 32 +.endef +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v16.16b, #0x5b // Lk_s63 + adrp x11, Lk_sb1 + add x11, x11, :lo12:Lk_sb1 + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + adrp x10, Lk_dksd + add x10, x10, :lo12:Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward + add x11, x11, :lo12:Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + +.def 
_vpaes_schedule_core + .type 32 +.endef +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr // lea Lk_sr(%rip),%r10 + add x10, x10, :lo12:Lk_sr + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:Lk_deskew + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.def _vpaes_schedule_192_smear + .type 32 +.endef +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## +.def _vpaes_schedule_round + .type 32 +.endef +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.def _vpaes_schedule_transform + .type 32 +.endef +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.def _vpaes_schedule_mangle + .type 32 +.endef +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl vpaes_set_encrypt_key + +.def vpaes_set_encrypt_key + .type 32 +.endef +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl vpaes_ctr32_encrypt_blocks + +.def vpaes_ctr32_encrypt_blocks + .type 32 +.endef +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. 
+ add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/ring-0.17.14/pregenerated/vpaes-x86-elf.S b/ring-0.17.14/pregenerated/vpaes-x86-elf.S new file mode 100644 index 0000000000..e4077ae72d --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86-elf.S @@ -0,0 +1,422 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.align 64 +.L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 2213177600,1597421020,4103937655,675398315 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 64 +.hidden _vpaes_preheat +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.size _vpaes_preheat,.-_vpaes_preheat +.hidden _vpaes_encrypt_core +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 +.byte 102,15,56,0,208 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx +.byte 102,15,56,0,193 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp .L000enc_entry +.align 16 +.L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + 
addl $16,%edx + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addl $16,%ecx + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +.L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz .L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core +.hidden _vpaes_schedule_core +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz .L002schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp .L003schedule_go +.L002schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%edx) + xorl $48,%ecx +.L003schedule_go: + cmpl $192,%eax + ja .L004schedule_256 +.L005schedule_128: + movl $10,%eax +.L006loop_schedule_128: + call _vpaes_schedule_round + decl %eax + jz .L007schedule_mangle_last + call _vpaes_schedule_mangle + jmp .L006loop_schedule_128 +.align 16 +.L004schedule_256: + movdqu 16(%esi),%xmm0 + call _vpaes_schedule_transform + movl $7,%eax +.L008loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call _vpaes_schedule_round + decl %eax + jz .L007schedule_mangle_last + call _vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call .L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp .L008loop_schedule_256 +.align 16 +.L007schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz .L009schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,193 + leal 352(%ebp),%ebx + addl $32,%edx +.L009schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core +.hidden _vpaes_schedule_round +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 +.byte 102,15,58,15,202,15 +.byte 102,15,58,15,210,15 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + movdqa %xmm2,8(%esp) +.L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 +.byte 
102,15,56,0,226 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round +.hidden _vpaes_schedule_transform +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%ebx),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform +.hidden _vpaes_schedule_mangle +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz .L010schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + jmp .L011schedule_mangle_both +.align 16 +.L010schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal (%ebp),%esi + movdqa %xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 32(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 64(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 96(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + addl $-16,%edx +.L011schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: +.L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L012pic_for_function_hit +.L012pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+5-.L012pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal .L_vpaes_consts+0x30-.L013pic_point,%ebp + call _vpaes_schedule_core +.L013pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: +.L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L014pic_for_function_hit +.L014pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+4-.L014pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal .L_vpaes_consts+0x30-.L015pic_point,%ebp + call _vpaes_preheat +.L015pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl 
%esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call _vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_encrypt,.-.L_vpaes_encrypt_begin +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/vpaes-x86-win32n.asm b/ring-0.17.14/pregenerated/vpaes-x86-win32n.asm new file mode 100644 index 0000000000..f52f7ca210 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86-win32n.asm @@ -0,0 +1,408 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif +align 64 +L$_vpaes_consts: +dd 218628480,235210255,168496130,67568393 +dd 252381056,17041926,33884169,51187212 +dd 252645135,252645135,252645135,252645135 +dd 1512730624,3266504856,1377990664,3401244816 +dd 830229760,1275146365,2969422977,3447763452 +dd 3411033600,2979783055,338359620,2782886510 +dd 4209124096,907596821,221174255,1006095553 +dd 191964160,3799684038,3164090317,1589111125 +dd 182528256,1777043520,2877432650,3265356744 +dd 1874708224,3503451415,3305285752,363511674 +dd 1606117888,3487855781,1093350906,2384367825 +dd 197121,67569157,134941193,202313229 +dd 67569157,134941193,202313229,197121 +dd 134941193,202313229,197121,67569157 +dd 202313229,197121,67569157,134941193 +dd 33619971,100992007,168364043,235736079 +dd 235736079,33619971,100992007,168364043 +dd 168364043,235736079,33619971,100992007 +dd 100992007,168364043,235736079,33619971 +dd 50462976,117835012,185207048,252579084 +dd 252314880,51251460,117574920,184942860 +dd 184682752,252054788,50987272,118359308 +dd 118099200,185467140,251790600,50727180 +dd 2946363062,528716217,1300004225,1881839624 +dd 1532713819,1532713819,1532713819,1532713819 +dd 3602276352,4288629033,3737020424,4153884961 +dd 1354558464,32357713,2958822624,3775749553 +dd 1201988352,132424512,1572796698,503232858 +dd 2213177600,1597421020,4103937655,675398315 +db 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +db 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +db 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +db 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +db 118,101,114,115,105,116,121,41,0 +align 64 +align 16 +__vpaes_preheat: + add ebp,DWORD [esp] + movdqa xmm7,[ebp-48] + movdqa xmm6,[ebp-16] + ret +align 16 +__vpaes_encrypt_core: + mov ecx,16 + mov eax,DWORD [240+edx] + movdqa xmm1,xmm6 + movdqa xmm2,[ebp] + pandn xmm1,xmm0 + pand xmm0,xmm6 + movdqu xmm5,[edx] +db 102,15,56,0,208 + movdqa xmm0,[16+ebp] + pxor xmm2,xmm5 + psrld xmm1,4 + add edx,16 +db 102,15,56,0,193 + lea ebx,[192+ebp] + pxor xmm0,xmm2 + jmp NEAR L$000enc_entry +align 16 +L$001enc_loop: + movdqa xmm4,[32+ebp] + movdqa xmm0,[48+ebp] +db 102,15,56,0,226 +db 102,15,56,0,195 + pxor xmm4,xmm5 + movdqa xmm5,[64+ebp] + pxor xmm0,xmm4 + movdqa xmm1,[ecx*1+ebx-64] +db 102,15,56,0,234 + movdqa xmm2,[80+ebp] + movdqa xmm4,[ecx*1+ebx] +db 102,15,56,0,211 + movdqa xmm3,xmm0 + pxor xmm2,xmm5 +db 102,15,56,0,193 + add edx,16 + pxor xmm0,xmm2 +db 102,15,56,0,220 + add ecx,16 + pxor xmm3,xmm0 +db 102,15,56,0,193 + and ecx,48 + sub eax,1 + pxor xmm0,xmm3 
+L$000enc_entry: + movdqa xmm1,xmm6 + movdqa xmm5,[ebp-32] + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm6 +db 102,15,56,0,232 + movdqa xmm3,xmm7 + pxor xmm0,xmm1 +db 102,15,56,0,217 + movdqa xmm4,xmm7 + pxor xmm3,xmm5 +db 102,15,56,0,224 + movdqa xmm2,xmm7 + pxor xmm4,xmm5 +db 102,15,56,0,211 + movdqa xmm3,xmm7 + pxor xmm2,xmm0 +db 102,15,56,0,220 + movdqu xmm5,[edx] + pxor xmm3,xmm1 + jnz NEAR L$001enc_loop + movdqa xmm4,[96+ebp] + movdqa xmm0,[112+ebp] +db 102,15,56,0,226 + pxor xmm4,xmm5 +db 102,15,56,0,195 + movdqa xmm1,[64+ecx*1+ebx] + pxor xmm0,xmm4 +db 102,15,56,0,193 + ret +align 16 +__vpaes_schedule_core: + add ebp,DWORD [esp] + movdqu xmm0,[esi] + movdqa xmm2,[320+ebp] + movdqa xmm3,xmm0 + lea ebx,[ebp] + movdqa [4+esp],xmm2 + call __vpaes_schedule_transform + movdqa xmm7,xmm0 + test edi,edi + jnz NEAR L$002schedule_am_decrypting + movdqu [edx],xmm0 + jmp NEAR L$003schedule_go +L$002schedule_am_decrypting: + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,217 + movdqu [edx],xmm3 + xor ecx,48 +L$003schedule_go: + cmp eax,192 + ja NEAR L$004schedule_256 +L$005schedule_128: + mov eax,10 +L$006loop_schedule_128: + call __vpaes_schedule_round + dec eax + jz NEAR L$007schedule_mangle_last + call __vpaes_schedule_mangle + jmp NEAR L$006loop_schedule_128 +align 16 +L$004schedule_256: + movdqu xmm0,[16+esi] + call __vpaes_schedule_transform + mov eax,7 +L$008loop_schedule_256: + call __vpaes_schedule_mangle + movdqa xmm6,xmm0 + call __vpaes_schedule_round + dec eax + jz NEAR L$007schedule_mangle_last + call __vpaes_schedule_mangle + pshufd xmm0,xmm0,255 + movdqa [20+esp],xmm7 + movdqa xmm7,xmm6 + call L$_vpaes_schedule_low_round + movdqa xmm7,[20+esp] + jmp NEAR L$008loop_schedule_256 +align 16 +L$007schedule_mangle_last: + lea ebx,[384+ebp] + test edi,edi + jnz NEAR L$009schedule_mangle_last_dec + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,193 + lea ebx,[352+ebp] + add edx,32 +L$009schedule_mangle_last_dec: + add edx,-16 + pxor xmm0,[336+ebp] + call __vpaes_schedule_transform + movdqu [edx],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + ret +align 16 +__vpaes_schedule_round: + movdqa xmm2,[8+esp] + pxor xmm1,xmm1 +db 102,15,58,15,202,15 +db 102,15,58,15,210,15 + pxor xmm7,xmm1 + pshufd xmm0,xmm0,255 +db 102,15,58,15,192,1 + movdqa [8+esp],xmm2 +L$_vpaes_schedule_low_round: + movdqa xmm1,xmm7 + pslldq xmm7,4 + pxor xmm7,xmm1 + movdqa xmm1,xmm7 + pslldq xmm7,8 + pxor xmm7,xmm1 + pxor xmm7,[336+ebp] + movdqa xmm4,[ebp-16] + movdqa xmm5,[ebp-48] + movdqa xmm1,xmm4 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm4 + movdqa xmm2,[ebp-32] +db 102,15,56,0,208 + pxor xmm0,xmm1 + movdqa xmm3,xmm5 +db 102,15,56,0,217 + pxor xmm3,xmm2 + movdqa xmm4,xmm5 +db 102,15,56,0,224 + pxor xmm4,xmm2 + movdqa xmm2,xmm5 +db 102,15,56,0,211 + pxor xmm2,xmm0 + movdqa xmm3,xmm5 +db 102,15,56,0,220 + pxor xmm3,xmm1 + movdqa xmm4,[32+ebp] +db 102,15,56,0,226 + movdqa xmm0,[48+ebp] +db 102,15,56,0,195 + pxor xmm0,xmm4 + pxor xmm0,xmm7 + movdqa xmm7,xmm0 + ret +align 16 +__vpaes_schedule_transform: + movdqa xmm2,[ebp-16] + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + movdqa xmm2,[ebx] +db 102,15,56,0,208 + movdqa xmm0,[16+ebx] +db 102,15,56,0,193 + pxor xmm0,xmm2 + ret +align 16 +__vpaes_schedule_mangle: + movdqa xmm4,xmm0 + movdqa xmm5,[128+ebp] + test edi,edi + jnz NEAR L$010schedule_mangle_dec + add edx,16 + pxor xmm4,[336+ebp] +db 102,15,56,0,229 + movdqa xmm3,xmm4 +db 102,15,56,0,229 + pxor 
xmm3,xmm4 +db 102,15,56,0,229 + pxor xmm3,xmm4 + jmp NEAR L$011schedule_mangle_both +align 16 +L$010schedule_mangle_dec: + movdqa xmm2,[ebp-16] + lea esi,[ebp] + movdqa xmm1,xmm2 + pandn xmm1,xmm4 + psrld xmm1,4 + pand xmm4,xmm2 + movdqa xmm2,[esi] +db 102,15,56,0,212 + movdqa xmm3,[16+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[32+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[48+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[64+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[80+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[96+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[112+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 + add edx,-16 +L$011schedule_mangle_both: + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,217 + add ecx,-16 + and ecx,48 + movdqu [edx],xmm3 + ret +global _vpaes_set_encrypt_key +align 16 +_vpaes_set_encrypt_key: +L$_vpaes_set_encrypt_key_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$012pic_for_function_hit +L$012pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+5-L$012pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov eax,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + mov ebx,eax + shr ebx,5 + add ebx,5 + mov DWORD [240+edx],ebx + mov ecx,48 + mov edi,0 + lea ebp,[(L$_vpaes_consts+0x30-L$013pic_point)] + call __vpaes_schedule_core +L$013pic_point: + mov esp,DWORD [48+esp] + xor eax,eax + pop edi + pop esi + pop ebx + pop ebp + ret +global _vpaes_encrypt +align 16 +_vpaes_encrypt: +L$_vpaes_encrypt_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$014pic_for_function_hit +L$014pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+4-L$014pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + lea ebp,[(L$_vpaes_consts+0x30-L$015pic_point)] + call __vpaes_preheat +L$015pic_point: + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov edi,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + movdqu xmm0,[esi] + call __vpaes_encrypt_core + movdqu [edi],xmm0 + mov esp,DWORD [48+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/vpaes-x86-win32n.o b/ring-0.17.14/pregenerated/vpaes-x86-win32n.o new file mode 100644 index 0000000000..b630709091 Binary files /dev/null and b/ring-0.17.14/pregenerated/vpaes-x86-win32n.o differ diff --git a/ring-0.17.14/pregenerated/vpaes-x86_64-elf.S b/ring-0.17.14/pregenerated/vpaes-x86_64-elf.S new file mode 100644 index 0000000000..146439f6b8 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86_64-elf.S @@ -0,0 +1,758 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +.Lenc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz .Lenc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.cfi_endproc +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core_2x,@function +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa .Lk_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + + movdqa .Lk_sb1(%rip),%xmm4 + movdqa .Lk_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa .Lk_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa .Lk_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +.Lenc2x_entry: 
+ + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz .Lenc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + ret +.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + + + + + + +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: +.cfi_startproc + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + + + movdqu %xmm0,(%rdx) + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $0xFF,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp .Loop_schedule_256 + + + + + + + + + + + + +.align 16 +.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_core,.-_vpaes_schedule_core + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: +.cfi_startproc + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $0xFF,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + 
pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_round,.-_vpaes_schedule_round + + + + + + + + + + +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: +.cfi_startproc + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.cfi_endproc +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: +.cfi_startproc + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $0x30,%r8 + movdqu %xmm3,(%rdx) + ret +.cfi_endproc +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + + + + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: +.cfi_startproc +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+5(%rip) +#endif + + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $0x30,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + ret +.cfi_endproc +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,@function +.align 16 +vpaes_ctr32_encrypt_blocks: +.cfi_startproc +_CET_ENDBR + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz .Lctr32_abort + movdqu (%r8),%xmm0 + movdqa .Lctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb .Lrev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz .Lctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz .Lctr32_done + +.Lctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa .Lctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz .Lctr32_loop + +.Lctr32_done: +.Lctr32_abort: + ret +.cfi_endproc +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks + + + + + + +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: +.cfi_startproc + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + ret +.cfi_endproc +.size _vpaes_preheat,.-_vpaes_preheat + + + + 
+ +.type _vpaes_consts,@object +.section .rodata +.align 64 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + +.Lrev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +.Lctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 64 +.size _vpaes_consts,.-_vpaes_consts +.text +#endif diff --git a/ring-0.17.14/pregenerated/vpaes-x86_64-macosx.S b/ring-0.17.14/pregenerated/vpaes-x86_64-macosx.S new file mode 100644 index 0000000000..b113e79f51 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86_64-macosx.S @@ -0,0 +1,757 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa L$k_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa L$k_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc_entry + +.p2align 4 +L$enc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +L$enc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz L$enc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core_2x: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa L$k_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc2x_entry + +.p2align 4 +L$enc2x_loop: + + movdqa L$k_sb1(%rip),%xmm4 + movdqa L$k_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa L$k_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa L$k_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +L$enc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld 
$4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz L$enc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + ret + + + + + + + + + +.p2align 4 +_vpaes_schedule_core: + + + + + + + call _vpaes_preheat + movdqa L$k_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq L$k_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq L$k_sr(%rip),%r10 + + + movdqu %xmm0,(%rdx) + +L$schedule_go: + cmpl $192,%esi + ja L$schedule_256 + + + + + + + + + + + +L$schedule_128: + movl $10,%esi + +L$oop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + jmp L$oop_schedule_128 + + + + + + + + + + + +.p2align 4 +L$schedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +L$oop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $0xFF,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp L$oop_schedule_256 + + + + + + + + + + + + +.p2align 4 +L$schedule_mangle_last: + + leaq L$k_deskew(%rip),%r11 + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq L$k_opt(%rip),%r11 + addq $32,%rdx + +L$schedule_mangle_last_dec: + addq $-16,%rdx + pxor L$k_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_round: + + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $0xFF,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor L$k_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret + 
+ + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_transform: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_mangle: + + movdqa %xmm0,%xmm4 + movdqa L$k_mc_forward(%rip),%xmm5 + + + addq $16,%rdx + pxor L$k_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + +L$schedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $0x30,%r8 + movdqu %xmm3,(%rdx) + ret + + + + + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key + +.p2align 4 +_vpaes_set_encrypt_key: + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+5(%rip) +#endif + + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $0x30,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + ret + + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.p2align 4 +_vpaes_ctr32_encrypt_blocks: + +_CET_ENDBR + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz L$ctr32_abort + movdqu (%r8),%xmm0 + movdqa L$ctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb L$rev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz L$ctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz L$ctr32_done + +L$ctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +L$ctr32_loop: + movdqa L$rev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa L$ctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz L$ctr32_loop + +L$ctr32_done: +L$ctr32_abort: + ret + + + + + + + + + +.p2align 4 +_vpaes_preheat: + + leaq L$k_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + ret + + + + + + + + +.section __DATA,__const +.p2align 6 +_vpaes_consts: +L$k_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +L$k_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +L$k_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +L$k_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +L$k_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +L$k_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +L$k_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +L$k_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +L$k_sr: 
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +L$k_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +L$k_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +L$k_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +L$k_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + +L$rev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +L$ctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +L$ctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.p2align 6 + +.text +#endif diff --git a/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.asm b/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.asm new file mode 100644 index 0000000000..c7e6eed496 --- /dev/null +++ b/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.asm @@ -0,0 +1,940 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + pandn xmm1,xmm0 + movdqu xmm5,XMMWORD[r9] + psrld xmm1,4 + pand xmm0,xmm9 +DB 102,15,56,0,208 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] +DB 102,15,56,0,193 + pxor xmm2,xmm5 + add r9,16 + pxor xmm0,xmm2 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc_entry + +ALIGN 16 +$L$enc_loop: + + movdqa xmm4,xmm13 + movdqa xmm0,xmm12 +DB 102,15,56,0,226 +DB 102,15,56,0,195 + pxor xmm4,xmm5 + movdqa xmm5,xmm15 + pxor xmm0,xmm4 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] +DB 102,15,56,0,234 + movdqa xmm4,XMMWORD[r10*1+r11] + movdqa xmm2,xmm14 +DB 102,15,56,0,211 + movdqa xmm3,xmm0 + pxor xmm2,xmm5 +DB 102,15,56,0,193 + add r9,16 + pxor xmm0,xmm2 +DB 102,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 +DB 102,15,56,0,193 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + +$L$enc_entry: + + movdqa xmm1,xmm9 + movdqa xmm5,xmm11 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 +DB 102,15,56,0,232 + movdqa xmm3,xmm10 + pxor xmm0,xmm1 +DB 102,15,56,0,217 + movdqa xmm4,xmm10 + pxor xmm3,xmm5 +DB 102,15,56,0,224 + movdqa xmm2,xmm10 + pxor xmm4,xmm5 +DB 102,15,56,0,211 + movdqa xmm3,xmm10 + pxor xmm2,xmm0 +DB 102,15,56,0,220 + movdqu xmm5,XMMWORD[r9] + pxor xmm3,xmm1 + jnz NEAR $L$enc_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] +DB 102,15,56,0,226 + pxor xmm4,xmm5 +DB 102,15,56,0,195 + movdqa xmm1,XMMWORD[64+r10*1+r11] + pxor xmm0,xmm4 +DB 102,15,56,0,193 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core_2x: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + movdqa xmm8,xmm2 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + movdqu xmm5,XMMWORD[r9] + + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,208 +DB 102,68,15,56,0,198 + movdqa 
xmm0,XMMWORD[(($L$k_ipt+16))] + movdqa xmm6,xmm0 +DB 102,15,56,0,193 +DB 102,15,56,0,247 + pxor xmm2,xmm5 + pxor xmm8,xmm5 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc2x_entry + +ALIGN 16 +$L$enc2x_loop: + + movdqa xmm4,XMMWORD[$L$k_sb1] + movdqa xmm0,XMMWORD[(($L$k_sb1+16))] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + pxor xmm4,xmm5 + pxor xmm12,xmm5 + movdqa xmm5,XMMWORD[$L$k_sb2] + movdqa xmm13,xmm5 + pxor xmm0,xmm4 + pxor xmm6,xmm12 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] + +DB 102,15,56,0,234 +DB 102,69,15,56,0,232 + movdqa xmm4,XMMWORD[r10*1+r11] + + movdqa xmm2,XMMWORD[(($L$k_sb2+16))] + movdqa xmm8,xmm2 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm0 + movdqa xmm11,xmm6 + pxor xmm2,xmm5 + pxor xmm8,xmm13 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 +DB 102,15,56,0,220 +DB 102,68,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 + pxor xmm11,xmm6 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + pxor xmm6,xmm11 + +$L$enc2x_entry: + + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm5,XMMWORD[(($L$k_inv+16))] + movdqa xmm13,xmm5 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,232 +DB 102,68,15,56,0,238 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm0,xmm1 + pxor xmm6,xmm7 +DB 102,15,56,0,217 +DB 102,68,15,56,0,223 + movdqa xmm4,xmm10 + movdqa xmm12,xmm10 + pxor xmm3,xmm5 + pxor xmm11,xmm13 +DB 102,15,56,0,224 +DB 102,68,15,56,0,230 + movdqa xmm2,xmm10 + movdqa xmm8,xmm10 + pxor xmm4,xmm5 + pxor xmm12,xmm13 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm2,xmm0 + pxor xmm8,xmm6 +DB 102,15,56,0,220 +DB 102,69,15,56,0,220 + movdqu xmm5,XMMWORD[r9] + + pxor xmm3,xmm1 + pxor xmm11,xmm7 + jnz NEAR $L$enc2x_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 + pxor xmm4,xmm5 + pxor xmm12,xmm5 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + movdqa xmm1,XMMWORD[64+r10*1+r11] + + pxor xmm0,xmm4 + pxor xmm6,xmm12 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + ret + + + + + + + + + +ALIGN 16 +_vpaes_schedule_core: + + + + + + + call _vpaes_preheat + movdqa xmm8,XMMWORD[$L$k_rcon] + movdqu xmm0,XMMWORD[rdi] + + + movdqa xmm3,xmm0 + lea r11,[$L$k_ipt] + call _vpaes_schedule_transform + movdqa xmm7,xmm0 + + lea r10,[$L$k_sr] + + + movdqu XMMWORD[rdx],xmm0 + +$L$schedule_go: + cmp esi,192 + ja NEAR $L$schedule_256 + + + + + + + + + + + +$L$schedule_128: + mov esi,10 + +$L$oop_schedule_128: + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + jmp NEAR $L$oop_schedule_128 + + + + + + + + + + + +ALIGN 16 +$L$schedule_256: + movdqu xmm0,XMMWORD[16+rdi] + call _vpaes_schedule_transform + mov esi,7 + +$L$oop_schedule_256: + call _vpaes_schedule_mangle + movdqa xmm6,xmm0 + + + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd xmm0,xmm0,0xFF + movdqa xmm5,xmm7 + movdqa xmm7,xmm6 + call _vpaes_schedule_low_round + movdqa xmm7,xmm5 + + jmp NEAR $L$oop_schedule_256 + + + + + + + + + + + + +ALIGN 16 +$L$schedule_mangle_last: + + lea r11,[$L$k_deskew] + + + movdqa xmm1,XMMWORD[r10*1+r8] +DB 102,15,56,0,193 + lea r11,[$L$k_opt] + add rdx,32 + +$L$schedule_mangle_last_dec: + 
add rdx,-16 + pxor xmm0,XMMWORD[$L$k_s63] + call _vpaes_schedule_transform + movdqu XMMWORD[rdx],xmm0 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + ret + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_round: + + + pxor xmm1,xmm1 +DB 102,65,15,58,15,200,15 +DB 102,69,15,58,15,192,15 + pxor xmm7,xmm1 + + + pshufd xmm0,xmm0,0xFF +DB 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa xmm1,xmm7 + pslldq xmm7,4 + pxor xmm7,xmm1 + movdqa xmm1,xmm7 + pslldq xmm7,8 + pxor xmm7,xmm1 + pxor xmm7,XMMWORD[$L$k_s63] + + + movdqa xmm1,xmm9 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + movdqa xmm2,xmm11 +DB 102,15,56,0,208 + pxor xmm0,xmm1 + movdqa xmm3,xmm10 +DB 102,15,56,0,217 + pxor xmm3,xmm2 + movdqa xmm4,xmm10 +DB 102,15,56,0,224 + pxor xmm4,xmm2 + movdqa xmm2,xmm10 +DB 102,15,56,0,211 + pxor xmm2,xmm0 + movdqa xmm3,xmm10 +DB 102,15,56,0,220 + pxor xmm3,xmm1 + movdqa xmm4,xmm13 +DB 102,15,56,0,226 + movdqa xmm0,xmm12 +DB 102,15,56,0,195 + pxor xmm0,xmm4 + + + pxor xmm0,xmm7 + movdqa xmm7,xmm0 + ret + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_transform: + + movdqa xmm1,xmm9 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + movdqa xmm2,XMMWORD[r11] +DB 102,15,56,0,208 + movdqa xmm0,XMMWORD[16+r11] +DB 102,15,56,0,193 + pxor xmm0,xmm2 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_mangle: + + movdqa xmm4,xmm0 + movdqa xmm5,XMMWORD[$L$k_mc_forward] + + + add rdx,16 + pxor xmm4,XMMWORD[$L$k_s63] +DB 102,15,56,0,229 + movdqa xmm3,xmm4 +DB 102,15,56,0,229 + pxor xmm3,xmm4 +DB 102,15,56,0,229 + pxor xmm3,xmm4 + +$L$schedule_mangle_both: + movdqa xmm1,XMMWORD[r10*1+r8] +DB 102,15,56,0,217 + add r8,-16 + and r8,0x30 + movdqu XMMWORD[rdx],xmm3 + ret + + + + + + +global vpaes_set_encrypt_key + +ALIGN 16 +vpaes_set_encrypt_key: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_set_encrypt_key: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+5))],1 +%endif + + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$enc_key_body: + mov eax,esi + shr eax,5 + add eax,5 + mov DWORD[240+rdx],eax + + mov ecx,0 + mov r8d,0x30 + call _vpaes_schedule_core + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$enc_key_epilogue: + xor eax,eax + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_set_encrypt_key: +global vpaes_ctr32_encrypt_blocks + +ALIGN 16 +vpaes_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + + xchg rdx,rcx + test rcx,rcx + jz NEAR $L$ctr32_abort + lea rsp,[((-184))+rsp] + movaps 
XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$ctr32_body: + movdqu xmm0,XMMWORD[r8] + movdqa xmm8,XMMWORD[$L$ctr_add_one] + sub rsi,rdi + call _vpaes_preheat + movdqa xmm6,xmm0 + pshufb xmm6,XMMWORD[$L$rev_ctr] + + test rcx,1 + jz NEAR $L$ctr32_prep_loop + + + + movdqu xmm7,XMMWORD[rdi] + call _vpaes_encrypt_core + pxor xmm0,xmm7 + paddd xmm6,xmm8 + movdqu XMMWORD[rdi*1+rsi],xmm0 + sub rcx,1 + lea rdi,[16+rdi] + jz NEAR $L$ctr32_done + +$L$ctr32_prep_loop: + + + movdqa xmm14,xmm6 + movdqa xmm15,xmm6 + paddd xmm15,xmm8 + +$L$ctr32_loop: + movdqa xmm1,XMMWORD[$L$rev_ctr] + movdqa xmm0,xmm14 + movdqa xmm6,xmm15 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm3,XMMWORD[$L$ctr_add_two] + pxor xmm0,xmm1 + pxor xmm6,xmm2 + paddd xmm14,xmm3 + paddd xmm15,xmm3 + movdqu XMMWORD[rdi*1+rsi],xmm0 + movdqu XMMWORD[16+rdi*1+rsi],xmm6 + sub rcx,2 + lea rdi,[32+rdi] + jnz NEAR $L$ctr32_loop + +$L$ctr32_done: + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$ctr32_epilogue: +$L$ctr32_abort: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_ctr32_encrypt_blocks: + + + + + + + +ALIGN 16 +_vpaes_preheat: + + lea r10,[$L$k_s0F] + movdqa xmm10,XMMWORD[((-32))+r10] + movdqa xmm11,XMMWORD[((-16))+r10] + movdqa xmm9,XMMWORD[r10] + movdqa xmm13,XMMWORD[48+r10] + movdqa xmm12,XMMWORD[64+r10] + movdqa xmm15,XMMWORD[80+r10] + movdqa xmm14,XMMWORD[96+r10] + ret + + + + + + + + +section .rdata rdata align=8 +ALIGN 64 +_vpaes_consts: +$L$k_inv: + DQ 0x0E05060F0D080180,0x040703090A0B0C02 + DQ 0x01040A060F0B0780,0x030D0E0C02050809 + +$L$k_s0F: + DQ 0x0F0F0F0F0F0F0F0F,0x0F0F0F0F0F0F0F0F + +$L$k_ipt: + DQ 0xC2B2E8985A2A7000,0xCABAE09052227808 + DQ 0x4C01307D317C4D00,0xCD80B1FCB0FDCC81 + +$L$k_sb1: + DQ 0xB19BE18FCB503E00,0xA5DF7A6E142AF544 + DQ 0x3618D415FAE22300,0x3BF7CCC10D2ED9EF +$L$k_sb2: + DQ 0xE27A93C60B712400,0x5EB7E955BC982FCD + DQ 0x69EB88400AE12900,0xC2A163C8AB82234A +$L$k_sbo: + DQ 0xD0D26D176FBDC700,0x15AABF7AC502A878 + DQ 0xCFE474A55FBB6A00,0x8E1E90D1412B35FA + +$L$k_mc_forward: + DQ 0x0407060500030201,0x0C0F0E0D080B0A09 + DQ 0x080B0A0904070605,0x000302010C0F0E0D + DQ 0x0C0F0E0D080B0A09,0x0407060500030201 + DQ 0x000302010C0F0E0D,0x080B0A0904070605 + +$L$k_mc_backward: + DQ 0x0605040702010003,0x0E0D0C0F0A09080B + DQ 0x020100030E0D0C0F,0x0A09080B06050407 + DQ 0x0E0D0C0F0A09080B,0x0605040702010003 + DQ 0x0A09080B06050407,0x020100030E0D0C0F + +$L$k_sr: + DQ 0x0706050403020100,0x0F0E0D0C0B0A0908 + DQ 0x030E09040F0A0500,0x0B06010C07020D08 + DQ 0x0F060D040B020900,0x070E050C030A0108 + DQ 0x0B0E0104070A0D00,0x0306090C0F020508 + +$L$k_rcon: + DQ 0x1F8391B9AF9DEEB6,0x702A98084D7C7D81 + +$L$k_s63: + DQ 0x5B5B5B5B5B5B5B5B,0x5B5B5B5B5B5B5B5B + +$L$k_opt: + DQ 0xFF9F4929D6B66000,0xF7974121DEBE6808 + DQ 0x01EDBD5150BCEC00,0xE10D5DB1B05C0CE0 + +$L$k_deskew: + DQ 0x07E4A34047A4E300,0x1DFEB95A5DBEF91A + DQ 0x5F36B5DC83EA6900,0x2841C2ABF49D1E77 + + +$L$rev_ctr: + DQ 
0x0706050403020100,0x0c0d0e0f0b0a0908 + + +$L$ctr_add_one: + DQ 0x0000000000000000,0x0000000100000000 +$L$ctr_add_two: + DQ 0x0000000000000000,0x0000000200000000 + + DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 + DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 + DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 + DB 109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32 + DB 85,110,105,118,101,114,115,105,116,121,41,0 +ALIGN 64 + +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + + lea rsi,[16+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + lea rax,[184+rax] + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_vpaes_set_encrypt_key wrt ..imagebase + DD $L$SEH_end_vpaes_set_encrypt_key wrt ..imagebase + DD $L$SEH_info_vpaes_set_encrypt_key wrt ..imagebase + + DD $L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_vpaes_set_encrypt_key: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$enc_key_body wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase +$L$SEH_info_vpaes_ctr32_encrypt_blocks: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.o b/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.o new file mode 100644 index 0000000000..4b04ba3f84 Binary files /dev/null and b/ring-0.17.14/pregenerated/vpaes-x86_64-nasm.o differ diff --git a/ring-0.17.14/pregenerated/x86-mont-elf.S b/ring-0.17.14/pregenerated/x86-mont-elf.S new file mode 100644 index 0000000000..e1923ea008 --- /dev/null +++ b/ring-0.17.14/pregenerated/x86-mont-elf.S @@ -0,0 +1,220 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl bn_mul_mont +.hidden bn_mul_mont +.type bn_mul_mont,@function +.align 16 +bn_mul_mont: +.L_bn_mul_mont_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + movl 40(%esp),%edi + leal 20(%esp),%esi + leal 24(%esp),%edx + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%ebp + negl %edi + movl %ebp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%ebp + xorl %ebp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L000page_walk + jmp .L001page_walk_done +.align 16 +.L000page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L000page_walk +.L001page_walk_done: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%ebp + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %ebp,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %edx,24(%esp) + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 16 +.L0021st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl .L0021st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +.L003outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +.L004inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz .L004inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle .L003outer + emms + jmp .L005common_tail +.align 16 +.L005common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 16 +.L006sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge .L006sub + sbbl 
$0,%eax + movl $-1,%edx + xorl %eax,%edx + jmp .L007copy +.align 16 +.L007copy: + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp + movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) + decl %ebx + jge .L007copy + movl 24(%esp),%esp + movl $1,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_mul_mont,.-.L_bn_mul_mont_begin +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/ring-0.17.14/pregenerated/x86-mont-win32n.asm b/ring-0.17.14/pregenerated/x86-mont-win32n.asm new file mode 100644 index 0000000000..fe0cc973e1 --- /dev/null +++ b/ring-0.17.14/pregenerated/x86-mont-win32n.asm @@ -0,0 +1,226 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%include "ring_core_generated/prefix_symbols_nasm.inc" +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _bn_mul_mont +align 16 +_bn_mul_mont: +L$_bn_mul_mont_begin: + push ebp + push ebx + push esi + push edi + xor eax,eax + mov edi,DWORD [40+esp] + lea esi,[20+esp] + lea edx,[24+esp] + add edi,2 + neg edi + lea ebp,[edi*4+esp-32] + neg edi + mov eax,ebp + sub eax,edx + and eax,2047 + sub ebp,eax + xor edx,ebp + and edx,2048 + xor edx,2048 + sub ebp,edx + and ebp,-64 + mov eax,esp + sub eax,ebp + and eax,-4096 + mov edx,esp + lea esp,[eax*1+ebp] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$000page_walk + jmp NEAR L$001page_walk_done +align 16 +L$000page_walk: + lea esp,[esp-4096] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$000page_walk +L$001page_walk_done: + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov ebp,DWORD [12+esi] + mov esi,DWORD [16+esi] + mov esi,DWORD [esi] + mov DWORD [4+esp],eax + mov DWORD [8+esp],ebx + mov DWORD [12+esp],ecx + mov DWORD [16+esp],ebp + mov DWORD [20+esp],esi + lea ebx,[edi-3] + mov DWORD [24+esp],edx + mov eax,-1 + movd mm7,eax + mov esi,DWORD [8+esp] + mov edi,DWORD [12+esp] + mov ebp,DWORD [16+esp] + xor edx,edx + xor ecx,ecx + movd mm4,DWORD [edi] + movd mm5,DWORD [esi] + movd mm3,DWORD [ebp] + pmuludq mm5,mm4 + movq mm2,mm5 + movq mm0,mm5 + pand mm0,mm7 + pmuludq mm5,[20+esp] + pmuludq mm3,mm5 + paddq mm3,mm0 + movd mm1,DWORD [4+ebp] + movd mm0,DWORD [4+esi] + psrlq mm2,32 + psrlq mm3,32 + inc ecx +align 16 +L$0021st: + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + movd mm1,DWORD [4+ecx*4+ebp] + paddq mm3,mm0 + movd mm0,DWORD [4+ecx*4+esi] + psrlq mm2,32 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm3,32 + lea ecx,[1+ecx] + cmp ecx,ebx + jl NEAR L$0021st + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + paddq mm3,mm0 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm2,32 + psrlq mm3,32 + paddq mm3,mm2 + movq [32+ebx*4+esp],mm3 + inc edx +L$003outer: + xor ecx,ecx + movd mm4,DWORD [edx*4+edi] + movd mm5,DWORD [esi] + movd mm6,DWORD [32+esp] + movd mm3,DWORD [ebp] + pmuludq mm5,mm4 + paddq mm5,mm6 + movq mm0,mm5 + movq mm2,mm5 + pand mm0,mm7 + pmuludq mm5,[20+esp] 
+ pmuludq mm3,mm5 + paddq mm3,mm0 + movd mm6,DWORD [36+esp] + movd mm1,DWORD [4+ebp] + movd mm0,DWORD [4+esi] + psrlq mm2,32 + psrlq mm3,32 + paddq mm2,mm6 + inc ecx + dec ebx +L$004inner: + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + movd mm6,DWORD [36+ecx*4+esp] + pand mm0,mm7 + movd mm1,DWORD [4+ecx*4+ebp] + paddq mm3,mm0 + movd mm0,DWORD [4+ecx*4+esi] + psrlq mm2,32 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm3,32 + paddq mm2,mm6 + dec ebx + lea ecx,[1+ecx] + jnz NEAR L$004inner + mov ebx,ecx + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + paddq mm3,mm0 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm2,32 + psrlq mm3,32 + movd mm6,DWORD [36+ebx*4+esp] + paddq mm3,mm2 + paddq mm3,mm6 + movq [32+ebx*4+esp],mm3 + lea edx,[1+edx] + cmp edx,ebx + jle NEAR L$003outer + emms + jmp NEAR L$005common_tail +align 16 +L$005common_tail: + mov ebp,DWORD [16+esp] + mov edi,DWORD [4+esp] + lea esi,[32+esp] + mov eax,DWORD [esi] + mov ecx,ebx + xor edx,edx +align 16 +L$006sub: + sbb eax,DWORD [edx*4+ebp] + mov DWORD [edx*4+edi],eax + dec ecx + mov eax,DWORD [4+edx*4+esi] + lea edx,[1+edx] + jge NEAR L$006sub + sbb eax,0 + mov edx,-1 + xor edx,eax + jmp NEAR L$007copy +align 16 +L$007copy: + mov esi,DWORD [32+ebx*4+esp] + mov ebp,DWORD [ebx*4+edi] + mov DWORD [32+ebx*4+esp],ecx + and esi,eax + and ebp,edx + or ebp,esi + mov DWORD [ebx*4+edi],ebp + dec ebx + jge NEAR L$007copy + mov esp,DWORD [24+esp] + mov eax,1 + pop edi + pop esi + pop ebx + pop ebp + ret +db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +db 111,114,103,62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/x86-mont-win32n.o b/ring-0.17.14/pregenerated/x86-mont-win32n.o new file mode 100644 index 0000000000..9d1b72b85d Binary files /dev/null and b/ring-0.17.14/pregenerated/x86-mont-win32n.o differ diff --git a/ring-0.17.14/pregenerated/x86_64-mont-elf.S b/ring-0.17.14/pregenerated/x86_64-mont-elf.S new file mode 100644 index 0000000000..01888e72bc --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont-elf.S @@ -0,0 +1,1237 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl bn_mul_mont_nohw +.hidden bn_mul_mont_nohw +.type bn_mul_mont_nohw,@function +.align 16 +bn_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + negq %r9 + movq %rsp,%r11 + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 +.Lmul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 +.Lmul_body: + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .L1st_enter + +.align 16 +.L1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.L1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne .L1st + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp .Louter +.align 16 +.Louter: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .Linner_enter + +.align 16 +.Linner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.Linner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne .Linner + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb .Louter + + xorq %r14,%r14 + movq (%rsp),%rax + movq %r9,%r15 + +.align 16 +.Lsub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsp,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz .Lsub + + sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx + xorq %r14,%r14 + movq %r9,%r15 + +.Lcopy: + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz .Lcopy + + movq 
8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul_epilogue: + ret +.cfi_endproc +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +.globl bn_mul4x_mont +.hidden bn_mul4x_mont +.type bn_mul4x_mont,@function +.align 16 +bn_mul4x_mont: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + negq %r9 + movq %rsp,%r11 + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 +.Lmul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .L1st4x +.align 16 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.align 4 +.Louter4x: + 
movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq (%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .Linner4x +.align 16 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jb .Louter4x + movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 + movq 0(%rsp),%rax + movq 8(%rsp),%rdx + shrq $2,%r15 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + +.Lsub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz .Lsub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 + shrq $2,%r15 + xorl %eax,%eax + + jmp .Lcopy4x +.align 16 +.Lcopy4x: + movdqa 
(%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax + decq %r15 + jnz .Lcopy4x + movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi, 8 + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont,.-bn_mul4x_mont +.extern bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.extern bn_sqr8x_internal +.hidden bn_sqr8x_internal + +.globl bn_sqr8x_mont +.hidden bn_sqr8x_mont +.type bn_sqr8x_mont,@function +.align 32 +bn_sqr8x_mont: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lsqr8x_prologue: + + movl %r9d,%r10d + shll $3,%r9d + shlq $3+2,%r10 + negq %r9 + + + + + + + leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lsqr8x_sp_alt + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp + jmp .Lsqr8x_sp_done + +.align 32 +.Lsqr8x_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lsqr8x_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 +.Lsqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: + + movq %r9,%r10 + negq %r9 + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lsqr8x_body: + +.byte 102,72,15,110,209 + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,73,15,110,218 + testq %rdx,%rdx + jz .Lsqr8x_nox + + call bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: + call bn_sqr8x_internal + + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz .Lsqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lsqr8x_cond_copy + +.align 32 +.Lsqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa 
%xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz .Lsqr8x_cond_copy + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lsqr8x_epilogue: + ret +.cfi_endproc +.size bn_sqr8x_mont,.-bn_sqr8x_mont +.globl bn_mulx4x_mont +.hidden bn_mulx4x_mont +.type bn_mulx4x_mont,@function +.align 32 +bn_mulx4x_mont: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 + movq %r9,48(%rsp) + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq 
%rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne .Lmulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz .Lmulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz .Lmulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont +.byte 
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 16 +#endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont-macosx.S b/ring-0.17.14/pregenerated/x86_64-mont-macosx.S new file mode 100644 index 0000000000..8b8185534c --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont-macosx.S @@ -0,0 +1,1235 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _bn_mul_mont_nohw +.private_extern _bn_mul_mont_nohw + +.p2align 4 +_bn_mul_mont_nohw: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + negq %r9 + movq %rsp,%r11 + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk + jmp L$mul_page_walk_done + +.p2align 4 +L$mul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk +L$mul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) + +L$mul_body: + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$1st_enter + +.p2align 4 +L$1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne L$1st + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp L$outer +.p2align 4 +L$outer: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$inner_enter + +.p2align 4 +L$inner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$inner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne L$inner + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb L$outer + + xorq %r14,%r14 + movq (%rsp),%rax + movq %r9,%r15 + +.p2align 4 +L$sub: sbbq (%rcx,%r14,8),%rax + movq 
%rax,(%rdi,%r14,8) + movq 8(%rsp,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz L$sub + + sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx + xorq %r14,%r14 + movq %r9,%r15 + +L$copy: + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz L$copy + + movq 8(%rsp,%r9,8),%rsi + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul_epilogue: + ret + + +.globl _bn_mul4x_mont +.private_extern _bn_mul4x_mont + +.p2align 4 +_bn_mul4x_mont: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + negq %r9 + movq %rsp,%r11 + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) + +L$mul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$1st4x +.p2align 4 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.p2align 2 +L$outer4x: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq 
(%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$inner4x +.p2align 4 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb L$inner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jb L$outer4x + movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 + movq 0(%rsp),%rax + movq 8(%rsp),%rdx + shrq $2,%r15 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + +L$sub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz L$sub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 + shrq $2,%r15 + xorl %eax,%eax + + jmp L$copy4x +.p2align 4 +L$copy4x: + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 
+ pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax + decq %r15 + jnz L$copy4x + movq 8(%rsp,%r9,8),%rsi + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul4x_epilogue: + ret + + + + + +.globl _bn_sqr8x_mont +.private_extern _bn_sqr8x_mont + +.p2align 5 +_bn_sqr8x_mont: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqr8x_prologue: + + movl %r9d,%r10d + shll $3,%r9d + shlq $3+2,%r10 + negq %r9 + + + + + + + leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$sqr8x_sp_alt + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp + jmp L$sqr8x_sp_done + +.p2align 5 +L$sqr8x_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$sqr8x_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk + jmp L$sqr8x_page_walk_done + +.p2align 4 +L$sqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk +L$sqr8x_page_walk_done: + + movq %r9,%r10 + negq %r9 + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$sqr8x_body: + +.byte 102,72,15,110,209 + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,73,15,110,218 + testq %rdx,%rdx + jz L$sqr8x_nox + + call _bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_nox: + call _bn_sqr8x_internal + + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz L$sqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + + jmp L$sqr8x_cond_copy + +.p2align 5 +L$sqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz L$sqr8x_cond_copy + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$sqr8x_epilogue: + ret + + +.globl _bn_mulx4x_mont +.private_extern _bn_mulx4x_mont + +.p2align 5 +_bn_mulx4x_mont: + +_CET_ENDBR + movq 
%rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +.p2align 4 +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) + + movq %r9,48(%rsp) + jmp L$mulx4x_body + +.p2align 5 +L$mulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 
0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne L$mulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp L$mulx4x_sub + +.p2align 5 +L$mulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz L$mulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + + jmp L$mulx4x_cond_copy + +.p2align 5 +L$mulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz L$mulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + ret + + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 4 +#endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont-nasm.asm b/ring-0.17.14/pregenerated/x86_64-mont-nasm.asm new file mode 100644 index 0000000000..361eaceeab --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont-nasm.asm @@ -0,0 +1,1468 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +global bn_mul_mont_nohw + +ALIGN 16 +bn_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-16))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +ALIGN 16 +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + +$L$mul_body: + mov r12,rdx + mov r8,QWORD[r8] + mov rbx,QWORD[r12] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$1st_enter + +ALIGN 16 +$L$1st: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r11 + mov r11,r10 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$1st_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + lea r15,[1+r15] + mov r10,rdx + + mul rbp + cmp r15,r9 + jne NEAR $L$1st + + add r13,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + mov r11,r10 + + xor rdx,rdx + add r13,r11 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + jmp NEAR $L$outer +ALIGN 16 +$L$outer: + mov rbx,QWORD[r14*8+r12] + xor r15,r15 + mov rbp,r8 + mov r10,QWORD[rsp] + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r10,QWORD[8+rsp] + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$inner_enter + +ALIGN 16 +$L$inner: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$inner_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + lea r15,[1+r15] + + mul rbp + cmp r15,r9 + jne NEAR $L$inner + + add r13,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + + xor rdx,rdx + add r13,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + cmp r14,r9 + jb NEAR $L$outer + + xor r14,r14 + mov rax,QWORD[rsp] + mov r15,r9 + +ALIGN 16 +$L$sub: sbb rax,QWORD[r14*8+rcx] + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[8+r14*8+rsp] + lea r14,[1+r14] + dec r15 + jnz NEAR $L$sub + + sbb rax,0 + mov rbx,-1 + xor rbx,rax + xor r14,r14 + mov r15,r9 + +$L$copy: + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax + mov QWORD[r14*8+rsp],r9 + or rdx,rcx + mov QWORD[r14*8+rdi],rdx + lea r14,[1+r14] + sub r15,1 + jnz NEAR $L$copy + + mov rsi,QWORD[8+r9*8+rsp] + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov 
r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul_mont_nohw: +global bn_mul4x_mont + +ALIGN 16 +bn_mul4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-32))+r9*8+rsp] + neg r9 + and r10,-1024 + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + +$L$mul4x_body: + mov QWORD[16+r9*8+rsp],rdi + mov r12,rdx + mov r8,QWORD[r8] + mov rbx,QWORD[r12] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[4+r15] + adc rdx,0 + mov QWORD[rsp],rdi + mov r13,rdx + jmp NEAR $L$1st4x +ALIGN 16 +$L$1st4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+r15*8+rcx] + adc rdx,0 + lea r15,[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jb NEAR $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov QWORD[r15*8+rsp],rdi + + lea r14,[1+r14] +ALIGN 4 +$L$outer4x: + mov rbx,QWORD[r14*8+r12] + xor r15,r15 + mov r10,QWORD[rsp] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov 
rax,QWORD[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[4+r15] + adc rdx,0 + mov QWORD[rsp],rdi + mov r13,rdx + jmp NEAR $L$inner4x +ALIGN 16 +$L$inner4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r15*8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,QWORD[r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+r15*8+rcx] + adc rdx,0 + add r11,QWORD[8+r15*8+rsp] + adc rdx,0 + lea r15,[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jb NEAR $L$inner4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r15*8+rsp] + adc rdx,0 + lea r14,[1+r14] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD[r9*8+rsp] + adc rdi,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov QWORD[r15*8+rsp],rdi + + cmp r14,r9 + jb NEAR $L$outer4x + mov rdi,QWORD[16+r9*8+rsp] + lea r15,[((-4))+r9] + mov rax,QWORD[rsp] + mov rdx,QWORD[8+rsp] + shr r15,2 + lea rsi,[rsp] + xor r14,r14 + + sub rax,QWORD[rcx] + mov rbx,QWORD[16+rsi] + mov rbp,QWORD[24+rsi] + sbb rdx,QWORD[8+rcx] + +$L$sub4x: + mov QWORD[r14*8+rdi],rax + mov QWORD[8+r14*8+rdi],rdx + sbb rbx,QWORD[16+r14*8+rcx] + mov rax,QWORD[32+r14*8+rsi] + mov rdx,QWORD[40+r14*8+rsi] + sbb rbp,QWORD[24+r14*8+rcx] + mov QWORD[16+r14*8+rdi],rbx + mov QWORD[24+r14*8+rdi],rbp + sbb rax,QWORD[32+r14*8+rcx] + mov rbx,QWORD[48+r14*8+rsi] + mov rbp,QWORD[56+r14*8+rsi] + sbb rdx,QWORD[40+r14*8+rcx] + lea r14,[4+r14] + dec r15 + jnz NEAR $L$sub4x + + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[32+r14*8+rsi] + sbb rbx,QWORD[16+r14*8+rcx] + mov QWORD[8+r14*8+rdi],rdx + sbb rbp,QWORD[24+r14*8+rcx] + mov QWORD[16+r14*8+rdi],rbx + + sbb rax,0 + mov QWORD[24+r14*8+rdi],rbp + pxor xmm0,xmm0 +DB 102,72,15,110,224 + pcmpeqd xmm5,xmm5 + pshufd xmm4,xmm4,0 + mov r15,r9 + pxor xmm5,xmm4 + shr r15,2 + xor eax,eax + + jmp NEAR $L$copy4x +ALIGN 16 +$L$copy4x: + movdqa xmm1,XMMWORD[rax*1+rsp] + movdqu xmm2,XMMWORD[rax*1+rdi] + pand xmm1,xmm4 + pand xmm2,xmm5 + movdqa xmm3,XMMWORD[16+rax*1+rsp] + movdqa XMMWORD[rax*1+rsp],xmm0 + por xmm1,xmm2 + movdqu xmm2,XMMWORD[16+rax*1+rdi] + movdqu XMMWORD[rax*1+rdi],xmm1 + pand xmm3,xmm4 + pand xmm2,xmm5 + movdqa XMMWORD[16+rax*1+rsp],xmm0 + por xmm3,xmm2 + movdqu XMMWORD[16+rax*1+rdi],xmm3 + lea rax,[32+rax] + dec r15 + jnz NEAR $L$copy4x + mov rsi,QWORD[8+r9*8+rsp] + + mov rax,1 + mov 
r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul4x_mont: +EXTERN bn_sqrx8x_internal +EXTERN bn_sqr8x_internal + +global bn_sqr8x_mont + +ALIGN 32 +bn_sqr8x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_sqr8x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqr8x_prologue: + + mov r10d,r9d + shl r9d,3 + shl r10,3+2 + neg r9 + + + + + + + lea r11,[((-64))+r9*2+rsp] + mov rbp,rsp + mov r8,QWORD[r8] + sub r11,rsi + and r11,4095 + cmp r10,r11 + jb NEAR $L$sqr8x_sp_alt + sub rbp,r11 + lea rbp,[((-64))+r9*2+rbp] + jmp NEAR $L$sqr8x_sp_done + +ALIGN 32 +$L$sqr8x_sp_alt: + lea r10,[((4096-64))+r9*2] + lea rbp,[((-64))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$sqr8x_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk + jmp NEAR $L$sqr8x_page_walk_done + +ALIGN 16 +$L$sqr8x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk +$L$sqr8x_page_walk_done: + + mov r10,r9 + neg r9 + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$sqr8x_body: + +DB 102,72,15,110,209 + pxor xmm0,xmm0 +DB 102,72,15,110,207 +DB 102,73,15,110,218 + test rdx,rdx + jz NEAR $L$sqr8x_nox + + call bn_sqrx8x_internal + + + + + lea rbx,[rcx*1+r8] + mov r9,rcx + mov rdx,rcx +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_nox: + call bn_sqr8x_internal + + + + + lea rbx,[r9*1+rdi] + mov rcx,r9 + mov rdx,r9 +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_sub: + mov r12,QWORD[rbx] + mov r13,QWORD[8+rbx] + mov r14,QWORD[16+rbx] + mov r15,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r12,QWORD[rbp] + sbb r13,QWORD[8+rbp] + sbb r14,QWORD[16+rbp] + sbb r15,QWORD[24+rbp] + lea rbp,[32+rbp] + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + inc rcx + jnz NEAR $L$sqr8x_sub + + sbb rax,0 + lea rbx,[r9*1+rbx] + lea rdi,[r9*1+rdi] + +DB 102,72,15,110,200 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$sqr8x_cond_copy + +ALIGN 32 +$L$sqr8x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0 + movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + add r9,32 + jnz NEAR $L$sqr8x_cond_copy + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$sqr8x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_sqr8x_mont: +global bn_mulx4x_mont 
+ +ALIGN 32 +bn_mulx4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + shl r9d,3 + xor r10,r10 + sub r10,r9 + mov r8,QWORD[r8] + lea rbp,[((-72))+r10*1+rsp] + and rbp,-128 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +ALIGN 16 +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + lea r10,[r9*1+rdx] + + + + + + + + + + + + + mov QWORD[rsp],r9 + shr r9,5 + mov QWORD[16+rsp],r10 + sub r9,1 + mov QWORD[24+rsp],r8 + mov QWORD[32+rsp],rdi + mov QWORD[40+rsp],rax + + mov QWORD[48+rsp],r9 + jmp NEAR $L$mulx4x_body + +ALIGN 32 +$L$mulx4x_body: + lea rdi,[8+rdx] + mov rdx,QWORD[rdx] + lea rbx,[((64+32))+rsp] + mov r9,rdx + + mulx rax,r8,QWORD[rsi] + mulx r14,r11,QWORD[8+rsi] + add r11,rax + mov QWORD[8+rsp],rdi + mulx r13,r12,QWORD[16+rsi] + adc r12,r14 + adc r13,0 + + mov rdi,r8 + imul r8,QWORD[24+rsp] + xor rbp,rbp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx rdi,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + mov rdi,QWORD[48+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] + DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + add r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + mov rdx,QWORD[rdi] + lea rdi,[8+rdi] + sub rsi,rax + mov QWORD[rbx],r15 + lea rbx,[((64+32))+rsp] + sub rcx,rax + + mulx r11,r8,QWORD[rsi] + xor ebp,ebp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + adox r12,QWORD[((-16))+rbx] + adcx r13,rbp + adox r13,rbp + + mov QWORD[8+rsp],rdi + mov r15,r8 + imul r8,QWORD[24+rsp] + xor ebp,ebp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + adcx r13,rax + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + lea rsi,[32+rsi] + adox r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx 
r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + lea rcx,[32+rcx] + adcx r12,rax + adox r15,rbp + mov rdi,QWORD[48+rsp] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-32))+rbx],r11 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + sub rbp,QWORD[rbx] + adc r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + + cmp rdi,QWORD[16+rsp] + jne NEAR $L$mulx4x_outer + + lea rbx,[64+rsp] + sub rcx,rax + neg r15 + mov rdx,rax + shr rax,3+2 + mov rdi,QWORD[32+rsp] + jmp NEAR $L$mulx4x_sub + +ALIGN 32 +$L$mulx4x_sub: + mov r11,QWORD[rbx] + mov r12,QWORD[8+rbx] + mov r13,QWORD[16+rbx] + mov r14,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r11,QWORD[rcx] + sbb r12,QWORD[8+rcx] + sbb r13,QWORD[16+rcx] + sbb r14,QWORD[24+rcx] + lea rcx,[32+rcx] + mov QWORD[rdi],r11 + mov QWORD[8+rdi],r12 + mov QWORD[16+rdi],r13 + mov QWORD[24+rdi],r14 + lea rdi,[32+rdi] + dec rax + jnz NEAR $L$mulx4x_sub + + sbb r15,0 + lea rbx,[64+rsp] + sub rdi,rdx + +DB 102,73,15,110,207 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$mulx4x_cond_copy + +ALIGN 32 +$L$mulx4x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + sub rdx,32 + jnz NEAR $L$mulx4x_cond_copy + + mov QWORD[rbx],rdx + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mulx4x_mont: + DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 + DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 + DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83 + DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 + DB 115,108,46,111,114,103,62,0 +ALIGN 16 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +mul_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10,QWORD[192+r8] + mov 
rax,QWORD[8+r10*8+rax] + + jmp NEAR $L$common_pop_regs + + + +ALIGN 16 +sqr_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[40+rax] + +$L$common_pop_regs: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_bn_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_bn_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_bn_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_bn_mul4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mul4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mul4x_mont wrt ..imagebase + + DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_bn_mul_mont_nohw: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase +$L$SEH_info_bn_mul4x_mont: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase +$L$SEH_info_bn_sqr8x_mont: + DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mulx4x_mont: + DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont-nasm.o b/ring-0.17.14/pregenerated/x86_64-mont-nasm.o new file mode 100644 index 0000000000..211ade6370 Binary files /dev/null and b/ring-0.17.14/pregenerated/x86_64-mont-nasm.o differ diff --git a/ring-0.17.14/pregenerated/x86_64-mont5-elf.S b/ring-0.17.14/pregenerated/x86_64-mont5-elf.S new file mode 100644 index 0000000000..f202ea3fc3 --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont5-elf.S @@ -0,0 +1,3188 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl bn_mul4x_mont_gather5 +.hidden bn_mul4x_mont_gather5 +.type bn_mul4x_mont_gather5,@function +.align 32 +bn_mul4x_mont_gather5: +.cfi_startproc +_CET_ENDBR +.byte 0x67 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmul4x_prologue: + +.byte 0x67 + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmul4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmul4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + negq %r9 + + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmul4x_body: + + call mul4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,@function +.align 32 +mul4x_internal: +.cfi_startproc + shlq $5,%r9 + movd 8(%rax),%xmm5 + leaq .Linc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 + shrq $5,%r9 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa 
%xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + + movq (%r8),%r8 + movq (%rsi),%rax + leaq (%rsi,%r9,1),%rsi + negq %r9 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + leaq 64+8(%rsp),%r14 + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + jmp .L1st4x + +.align 32 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + jmp .Louter4x + +.align 32 +.Louter4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand 
-112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + + movq (%r14,%r9,1),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + movq %rdi,(%r14) + + leaq (%r14,%r9,1),%r14 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdx,%r13 + jmp .Linner4x + +.align 32 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + addq (%r14),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq %rbp,%rax + movq -8(%rcx),%rbp + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + movq %rdi,-16(%r14) + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%r14),%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb .Louter4x + xorq %rax,%rax + subq %r13,%rbp + adcq %r15,%r15 
+ orq %r15,%rdi + subq %rdi,%rax + leaq (%r14,%r9,1),%rbx + movq (%rcx),%r12 + leaq (%rcx),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry +.cfi_endproc +.size mul4x_internal,.-mul4x_internal +.globl bn_power5_nohw +.hidden bn_power5_nohw +.type bn_power5_nohw,@function +.align 32 +bn_power5_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpower5_prologue: + + + + + shll $3,%r9d + leal (%r9,%r9,2),%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwr_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwr_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpower5_body: +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpower5_epilogue: + ret +.cfi_endproc +.size bn_power5_nohw,.-bn_power5_nohw + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,@function +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: +.cfi_startproc +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq 
%r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz .Lsqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq %r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 
%rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp + xorq %r14,%r14 + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) + movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp .Lsqr4x_shift_n_add + +.align 32 +.Lsqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz .Lsqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) +.byte 102,72,15,126,213 +__bn_sqr8x_reduction: + xorq %rax,%rax + leaq (%r9,%rbp,1),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp .L8x_reduction_loop + +.align 32 +.L8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 
32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp .L8x_reduce + +.align 32 +.L8x_reduce: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_reduce + + leaq 64(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp .L8x_tail + +.align 32 +.L8x_tail: + mulq %rbx + addq %rax,%r8 + movq 8(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_tail + + leaq 64(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp .L8x_tail + +.align 32 +.L8x_tail_done: + xorq %rax,%rax + addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + negq %rsi +.L8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -8(%rbp),%rcx + xorq %rsi,%rsi + +.byte 102,72,15,126,213 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) +.byte 102,73,15,126,217 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb 
.L8x_reduction_loop + ret +.cfi_endproc +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.type __bn_post4x_internal,@function +.align 32 +__bn_post4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx +.byte 102,72,15,126,207 + negq %rax +.byte 102,72,15,126,206 + sarq $3+2,%rcx + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry + +.align 16 +.Lsqr4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 + movq %r12,0(%rdi) + leaq 32(%rbx),%rbx + movq %r13,8(%rdi) + sbbq %r10,%r10 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz .Lsqr4x_sub + + movq %r9,%r10 + negq %r9 + ret +.cfi_endproc +.size __bn_post4x_internal,.-__bn_post4x_internal +.globl bn_mulx4x_mont_gather5 +.hidden bn_mulx4x_mont_gather5 +.type bn_mulx4x_mont_gather5,@function +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,@function +.align 32 +mulx4x_internal: +.cfi_startproc + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq .Linc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd 
%xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 
24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq 
%r10,%rdi + jb .Lmulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry +.cfi_endproc +.size mulx4x_internal,.-mulx4x_internal +.globl bn_powerx5 +.hidden bn_powerx5 +.type bn_powerx5,@function +.align 32 +bn_powerx5: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpowerx5_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + ret +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,@function +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +.Lsqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz .Lsqrx8x_zero + + movq 0(%rsi),%rdx + + xorq 
%r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je .Lsqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl 
$0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz .Lsqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je .Lsqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je .Lsqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.align 32 +.Lsqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz .Lsqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + 
adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz .Lsqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz .Lsqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +.Lsqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb .Lsqrx8x_reduction_loop + ret +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.align 32 +.type __bn_postx4x_internal,@function +__bn_postx4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + 
adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz .Lsqrx4x_sub + + negq %r9 + + ret +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal +.globl bn_scatter5 +.hidden bn_scatter5 +.type bn_scatter5,@function +.align 16 +bn_scatter5: +.cfi_startproc +_CET_ENDBR + cmpl $0,%esi + jz .Lscatter_epilogue + + + + + + + + + + leaq (%rdx,%rcx,8),%rdx +.Lscatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz .Lscatter +.Lscatter_epilogue: + ret +.cfi_endproc +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.hidden bn_gather5 +.type bn_gather5,@function +.align 32 +bn_gather5: +.cfi_startproc +.LSEH_begin_bn_gather5: +_CET_ENDBR + +.byte 0x4c,0x8d,0x14,0x24 +.cfi_def_cfa_register %r10 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq .Linc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp .Lgather + +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 
96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz .Lgather + + leaq (%r10),%rsp +.cfi_def_cfa_register %rsp + ret +.LSEH_end_bn_gather5: +.cfi_endproc +.size bn_gather5,.-bn_gather5 +.section .rodata +.align 64 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +#endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont5-macosx.S b/ring-0.17.14/pregenerated/x86_64-mont5-macosx.S new file mode 100644 index 0000000000..dd17dc90c2 --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont5-macosx.S @@ -0,0 +1,3188 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _bn_mul4x_mont_gather5 +.private_extern _bn_mul4x_mont_gather5 + +.p2align 5 +_bn_mul4x_mont_gather5: + +_CET_ENDBR +.byte 0x67 + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mul4x_prologue: + +.byte 0x67 + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mul4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$mul4xsp_done + +.p2align 5 +L$mul4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$mul4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + + negq %r9 + + movq %rax,40(%rsp) + +L$mul4x_body: + + call mul4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul4x_epilogue: + ret + + + + +.p2align 5 +mul4x_internal: + + shlq $5,%r9 + movd 8(%rax),%xmm5 + leaq L$inc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 + shrq $5,%r9 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + 
paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + + movq (%r8),%r8 + movq (%rsi),%rax + leaq (%rsi,%r9,1),%rsi + negq %r9 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + leaq 64+8(%rsp),%r14 + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + jmp L$1st4x + +.p2align 5 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 
32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + jmp L$outer4x + +.p2align 5 +L$outer4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + + movq (%r14,%r9,1),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + movq %rdi,(%r14) + + leaq (%r14,%r9,1),%r14 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdx,%r13 + jmp L$inner4x + +.p2align 5 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + addq (%r14),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz L$inner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + 
adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq %rbp,%rax + movq -8(%rcx),%rbp + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + movq %rdi,-16(%r14) + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%r14),%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb L$outer4x + xorq %rax,%rax + subq %r13,%rbp + adcq %r15,%r15 + orq %r15,%rdi + subq %rdi,%rax + leaq (%r14,%r9,1),%rbx + movq (%rcx),%r12 + leaq (%rcx),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry + + +.globl _bn_power5_nohw +.private_extern _bn_power5_nohw + +.p2align 5 +_bn_power5_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$power5_prologue: + + + + + shll $3,%r9d + leal (%r9,%r9,2),%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwr_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$pwr_sp_done + +.p2align 5 +L$pwr_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$pwr_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk + jmp L$pwr_page_walk_done + +L$pwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk +L$pwr_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$power5_body: +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$power5_epilogue: + ret + + + +.globl _bn_sqr8x_internal +.private_extern _bn_sqr8x_internal +.private_extern _bn_sqr8x_internal + +.p2align 5 +_bn_sqr8x_internal: +__bn_sqr8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq 
$0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp L$sqr4x_1st + +.p2align 5 +L$sqr4x_1st: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne L$sqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp L$sqr4x_outer + +.p2align 5 +L$sqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp L$sqr4x_inner + +.p2align 5 +L$sqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne L$sqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz L$sqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 
%r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp + xorq %r14,%r14 + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) + movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp L$sqr4x_shift_n_add + +.p2align 5 +L$sqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz L$sqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) +.byte 102,72,15,126,213 +__bn_sqr8x_reduction: + xorq %rax,%rax + leaq (%r9,%rbp,1),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp L$8x_reduction_loop + +.p2align 5 +L$8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 
0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp L$8x_reduce + +.p2align 5 +L$8x_reduce: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz L$8x_reduce + + leaq 64(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae L$8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp L$8x_tail + +.p2align 5 +L$8x_tail: + mulq %rbx + addq %rax,%r8 + movq 8(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz L$8x_tail + + leaq 64(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae L$8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp L$8x_tail + +.p2align 5 +L$8x_tail_done: + xorq %rax,%rax + addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + negq %rsi +L$8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -8(%rbp),%rcx + xorq %rsi,%rsi + +.byte 102,72,15,126,213 + + 
movq %r8,0(%rdi) + movq %r9,8(%rdi) +.byte 102,73,15,126,217 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb L$8x_reduction_loop + ret + + + +.p2align 5 +__bn_post4x_internal: + + movq 0(%rbp),%r12 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx +.byte 102,72,15,126,207 + negq %rax +.byte 102,72,15,126,206 + sarq $3+2,%rcx + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry + +.p2align 4 +L$sqr4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 + movq %r12,0(%rdi) + leaq 32(%rbx),%rbx + movq %r13,8(%rdi) + sbbq %r10,%r10 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz L$sqr4x_sub + + movq %r9,%r10 + negq %r9 + ret + + +.globl _bn_mulx4x_mont_gather5 +.private_extern _bn_mulx4x_mont_gather5 + +.p2align 5 +_bn_mulx4x_mont_gather5: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx4x_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$mulx4xsp_done + +L$mulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$mulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$mulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + ret + + + + +.p2align 5 +mulx4x_internal: + + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq L$inc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa 
%xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 
+ pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb L$mulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + + +.globl 
_bn_powerx5 +.private_extern _bn_powerx5 + +.p2align 5 +_bn_powerx5: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$powerx5_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$pwrx_sp_done + +.p2align 5 +L$pwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$pwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk + jmp L$pwrx_page_walk_done + +L$pwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk +L$pwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$powerx5_epilogue: + ret + + + +.globl _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal + +.p2align 5 +_bn_sqrx8x_internal: +__bn_sqrx8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp L$sqr8x_zero_start + +.p2align 5 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +L$sqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +L$sqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz L$sqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + 
adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je L$sqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz L$sqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je L$sqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + 
je L$sqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.p2align 5 +L$sqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz L$sqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp L$sqrx4x_shift_n_add + +.p2align 5 +L$sqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp L$sqrx8x_reduction_loop + +.p2align 5 +L$sqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp L$sqrx8x_reduce + +.p2align 5 +L$sqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz L$sqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq 
%rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz L$sqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +L$sqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb L$sqrx8x_reduction_loop + ret + + +.p2align 5 + +__bn_postx4x_internal: + + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + +.p2align 4 +L$sqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz L$sqrx4x_sub + + negq %r9 + + ret + + +.globl _bn_scatter5 +.private_extern _bn_scatter5 + +.p2align 4 +_bn_scatter5: + +_CET_ENDBR + cmpl $0,%esi + jz L$scatter_epilogue + + + + + + + + + + leaq (%rdx,%rcx,8),%rdx +L$scatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz L$scatter +L$scatter_epilogue: + ret + + + +.globl _bn_gather5 +.private_extern _bn_gather5 + +.p2align 5 +_bn_gather5: + +L$SEH_begin_bn_gather5: +_CET_ENDBR + +.byte 0x4c,0x8d,0x14,0x24 + +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq L$inc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa 
%xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp L$gather + +.p2align 5 +L$gather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz L$gather + + leaq (%r10),%rsp + + ret +L$SEH_end_bn_gather5: + + +.section __DATA,__const +.p2align 6 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +#endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont5-nasm.asm b/ring-0.17.14/pregenerated/x86_64-mont5-nasm.asm new file mode 100644 index 0000000000..eeb354322d --- /dev/null +++ b/ring-0.17.14/pregenerated/x86_64-mont5-nasm.asm @@ -0,0 +1,3401 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. 
Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "ring_core_generated/prefix_symbols_nasm.inc" +section .text code align=64 + + +global bn_mul4x_mont_gather5 + +ALIGN 32 +bn_mul4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + DB 0x67 + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mul4x_prologue: + + DB 0x67 + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mul4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mul4xsp_done + +ALIGN 32 +$L$mul4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mul4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + neg r9 + + mov QWORD[40+rsp],rax + +$L$mul4x_body: + + call mul4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul4x_mont_gather5: + + +ALIGN 32 +mul4x_internal: + + shl r9,5 + movd xmm5,DWORD[56+rax] + lea rax,[$L$inc] + lea r13,[128+r9*1+rdx] + shr r9,5 + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r9*1+rsp] + lea r12,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + DB 0x67,0x67 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + DB 0x67 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + DB 0x67 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + pand xmm0,XMMWORD[64+r12] + + pand 
xmm1,XMMWORD[80+r12] + pand xmm2,XMMWORD[96+r12] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+r12] + movdqa xmm5,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+r12] + movdqa xmm5,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[r12] + movdqa xmm5,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+r12] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea r12,[256+r12] +DB 102,72,15,126,195 + + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((56+8))+rsp],rdi + + mov r8,QWORD[r8] + mov rax,QWORD[rsi] + lea rsi,[r9*1+rsi] + neg r9 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + lea r14,[((64+8))+rsp] + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+r9*1+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r9*1+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[32+r9] + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[r14],rdi + mov r13,rdx + jmp NEAR $L$1st4x + +ALIGN 32 +$L$1st4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r15*1+rsi] + adc rdx,0 + add rdi,r11 + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[r14],rdi + mov r13,rdx + + add r15,32 + jnz NEAR $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r9*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov r13,rdx + + lea rcx,[r9*1+rcx] + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD[((-8))+r14],r13 + + jmp NEAR $L$outer4x + +ALIGN 32 +$L$outer4x: + lea rdx,[((16+128))+r14] + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r12] + movdqa xmm1,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm0,XMMWORD[((-128))+rdx] + pand 
xmm1,XMMWORD[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r12] + movdqa xmm1,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm0,XMMWORD[((-64))+rdx] + pand xmm1,XMMWORD[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r12] + movdqa xmm1,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + movdqa xmm3,XMMWORD[48+r12] + pand xmm0,XMMWORD[rdx] + pand xmm1,XMMWORD[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r12] + movdqa xmm1,XMMWORD[80+r12] + movdqa xmm2,XMMWORD[96+r12] + movdqa xmm3,XMMWORD[112+r12] + pand xmm0,XMMWORD[64+rdx] + pand xmm1,XMMWORD[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea r12,[256+r12] +DB 102,72,15,126,195 + + mov r10,QWORD[r9*1+r14] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + mov QWORD[r14],rdi + + lea r14,[r9*1+r14] + + mul rbp + add r10,rax + mov rax,QWORD[8+r9*1+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r9*1+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[32+r9] + lea rcx,[32+rcx] + adc rdx,0 + mov r13,rdx + jmp NEAR $L$inner4x + +ALIGN 32 +$L$inner4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + adc rdx,0 + add r10,QWORD[16+r14] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-32))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + add r10,QWORD[r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r15*1+rsi] + adc rdx,0 + add rdi,r11 + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[((-8))+r14],r13 + mov r13,rdx + + add r15,32 + jnz NEAR $L$inner4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + adc rdx,0 + add r10,QWORD[16+r14] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-32))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,rbp + mov rbp,QWORD[((-8))+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r9*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov r13,rdx + + mov QWORD[((-16))+r14],rdi + lea rcx,[r9*1+rcx] + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD[r14] + adc rdi,0 + mov QWORD[((-8))+r14],r13 + + cmp r12,QWORD[((16+8))+rsp] + jb NEAR $L$outer4x 
+ xor rax,rax + sub rbp,r13 + adc r15,r15 + or rdi,r15 + sub rax,rdi + lea rbx,[r9*1+r14] + mov r12,QWORD[rcx] + lea rbp,[rcx] + mov rcx,r9 + sar rcx,3+2 + mov rdi,QWORD[((56+8))+rsp] + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + + +global bn_power5_nohw + +ALIGN 32 +bn_power5_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_power5_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$power5_prologue: + + + + + shl r9d,3 + lea r10d,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwr_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwr_sp_done + +ALIGN 32 +$L$pwr_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwr_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk + jmp NEAR $L$pwr_page_walk_done + +$L$pwr_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk +$L$pwr_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$power5_body: +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 102,73,15,110,218 +DB 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + +DB 102,72,15,126,209 +DB 102,72,15,126,226 + mov rdi,rsi + mov rax,QWORD[40+rsp] + lea r8,[32+rsp] + + call mul4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$power5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_power5_nohw: + +global bn_sqr8x_internal + + +ALIGN 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rbp,[32+r10] + lea rsi,[r9*1+rsi] + + mov rcx,r9 + + + mov r14,QWORD[((-32))+rbp*1+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rbp*1+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rbp*1+rsi] + mov r15,rax + + mul r14 + mov r10,rax + mov rax,rbx + mov r11,rdx + mov QWORD[((-24))+rbp*1+rdi],r10 + + mul r14 + add r11,rax + mov rax,rbx + adc rdx,0 + mov QWORD[((-16))+rbp*1+rdi],r11 + mov r10,rdx + + + mov rbx,QWORD[((-8))+rbp*1+rsi] + mul r15 + mov r12,rax + mov rax,rbx + mov r13,rdx + + lea rcx,[rbp] + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + jmp NEAR $L$sqr4x_1st + +ALIGN 32 +$L$sqr4x_1st: + mov rbx,QWORD[rcx*1+rsi] + mul r15 + add r13,rax + mov rax,rbx + mov r12,rdx + adc r12,0 + + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[8+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add 
r11,r13 + adc r10,0 + + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[rcx*1+rdi],r11 + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + mov rbx,QWORD[16+rcx*1+rsi] + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + + mul r15 + add r13,rax + mov rax,rbx + mov QWORD[8+rcx*1+rdi],r10 + mov r12,rdx + adc r12,0 + + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[24+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[16+rcx*1+rdi],r11 + mov r13,rdx + adc r13,0 + lea rcx,[32+rcx] + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + + cmp rcx,0 + jne NEAR $L$sqr4x_1st + + mul r15 + add r13,rax + lea rbp,[16+rbp] + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + jmp NEAR $L$sqr4x_outer + +ALIGN 32 +$L$sqr4x_outer: + mov r14,QWORD[((-32))+rbp*1+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rbp*1+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rbp*1+rsi] + mov r15,rax + + mul r14 + mov r10,QWORD[((-24))+rbp*1+rdi] + add r10,rax + mov rax,rbx + adc rdx,0 + mov QWORD[((-24))+rbp*1+rdi],r10 + mov r11,rdx + + mul r14 + add r11,rax + mov rax,rbx + adc rdx,0 + add r11,QWORD[((-16))+rbp*1+rdi] + mov r10,rdx + adc r10,0 + mov QWORD[((-16))+rbp*1+rdi],r11 + + xor r12,r12 + + mov rbx,QWORD[((-8))+rbp*1+rsi] + mul r15 + add r12,rax + mov rax,rbx + adc rdx,0 + add r12,QWORD[((-8))+rbp*1+rdi] + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + adc rdx,0 + add r10,r12 + mov r11,rdx + adc r11,0 + mov QWORD[((-8))+rbp*1+rdi],r10 + + lea rcx,[rbp] + jmp NEAR $L$sqr4x_inner + +ALIGN 32 +$L$sqr4x_inner: + mov rbx,QWORD[rcx*1+rsi] + mul r15 + add r13,rax + mov rax,rbx + mov r12,rdx + adc r12,0 + add r13,QWORD[rcx*1+rdi] + adc r12,0 + + DB 0x67 + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[8+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + mul r15 + add r12,rax + mov QWORD[rcx*1+rdi],r11 + mov rax,rbx + mov r13,rdx + adc r13,0 + add r12,QWORD[8+rcx*1+rdi] + lea rcx,[16+rcx] + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + adc rdx,0 + add r10,r12 + mov r11,rdx + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + + cmp rcx,0 + jne NEAR $L$sqr4x_inner + + DB 0x67 + mul r15 + add r13,rax + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + + add rbp,16 + jnz NEAR $L$sqr4x_outer + + + mov r14,QWORD[((-32))+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rsi] + mov r15,rax + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + + mul r14 + add r11,rax + mov rax,rbx + mov QWORD[((-24))+rdi],r10 + mov r10,rdx + adc r10,0 + add r11,r13 + mov rbx,QWORD[((-8))+rsi] + adc r10,0 + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[((-16))+rdi],r11 + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rdi],r10 + + mul r15 + add r13,rax + mov rax,QWORD[((-16))+rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + + mul rbx + add rbp,16 + xor r14,r14 + sub rbp,r9 + xor r15,r15 + + add rax,r12 + adc rdx,0 + mov QWORD[8+rdi],rax + mov QWORD[16+rdi],rdx + mov QWORD[24+rdi],r15 + + mov rax,QWORD[((-16))+rbp*1+rsi] + lea rdi,[((48+8))+rsp] + xor r10,r10 + mov r11,QWORD[8+rdi] + + lea r12,[r10*2+r14] + shr 
r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[16+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[24+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rbp*1+rsi] + mov QWORD[rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[8+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[32+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[40+rdi] + adc rbx,rax + mov rax,QWORD[rbp*1+rsi] + mov QWORD[16+rdi],rbx + adc r8,rdx + lea rbp,[16+rbp] + mov QWORD[24+rdi],r8 + sbb r15,r15 + lea rdi,[64+rdi] + jmp NEAR $L$sqr4x_shift_n_add + +ALIGN 32 +$L$sqr4x_shift_n_add: + lea r12,[r10*2+r14] + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[((-16))+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[((-8))+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rbp*1+rsi] + mov QWORD[((-32))+rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[((-24))+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[8+rdi] + adc rbx,rax + mov rax,QWORD[rbp*1+rsi] + mov QWORD[((-16))+rdi],rbx + adc r8,rdx + + lea r12,[r10*2+r14] + mov QWORD[((-8))+rdi],r8 + sbb r15,r15 + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[16+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[24+rdi] + adc r12,rax + mov rax,QWORD[8+rbp*1+rsi] + mov QWORD[rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[8+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[32+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[40+rdi] + adc rbx,rax + mov rax,QWORD[16+rbp*1+rsi] + mov QWORD[16+rdi],rbx + adc r8,rdx + mov QWORD[24+rdi],r8 + sbb r15,r15 + lea rdi,[64+rdi] + add rbp,32 + jnz NEAR $L$sqr4x_shift_n_add + + lea r12,[r10*2+r14] + DB 0x67 + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[((-16))+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[((-8))+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rsi] + mov QWORD[((-32))+rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[((-24))+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mul rax + neg r15 + adc rbx,rax + adc r8,rdx + mov QWORD[((-16))+rdi],rbx + mov QWORD[((-8))+rdi],r8 +DB 102,72,15,126,213 +__bn_sqr8x_reduction: + xor rax,rax + lea rcx,[rbp*1+r9] + lea rdx,[((48+8))+r9*2+rsp] + mov QWORD[((0+8))+rsp],rcx + lea rdi,[((48+8))+r9*1+rsp] + mov QWORD[((8+8))+rsp],rdx + neg r9 + jmp NEAR $L$8x_reduction_loop + +ALIGN 32 +$L$8x_reduction_loop: + lea rdi,[r9*1+rdi] + DB 0x66 + mov rbx,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[rdx],rax + lea rdi,[64+rdi] + + DB 0x67 + mov r8,rbx + imul rbx,QWORD[((32+8))+rsp] + mov rax,QWORD[rbp] + mov ecx,8 + jmp NEAR $L$8x_reduce + +ALIGN 32 +$L$8x_reduce: + mul rbx + mov rax,QWORD[8+rbp] + neg r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + mov QWORD[((48-8+8))+rcx*8+rsp],rbx + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov rsi,QWORD[((32+8))+rsp] + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + imul rsi,r8 + add r10,r11 + mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + add 
r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,rsi + add r15,rax + mov rax,QWORD[rbp] + adc rdx,0 + add r14,r15 + mov r15,rdx + adc r15,0 + + dec ecx + jnz NEAR $L$8x_reduce + + lea rbp,[64+rbp] + xor rax,rax + mov rdx,QWORD[((8+8))+rsp] + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$8x_no_tail + + DB 0x66 + add r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + sbb rsi,rsi + + mov rbx,QWORD[((48+56+8))+rsp] + mov ecx,8 + mov rax,QWORD[rbp] + jmp NEAR $L$8x_tail + +ALIGN 32 +$L$8x_tail: + mul rbx + add r8,rax + mov rax,QWORD[8+rbp] + mov QWORD[rdi],r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + lea rdi,[8+rdi] + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + add r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,QWORD[((48-16+8))+rcx*8+rsp] + add r15,rax + adc rdx,0 + add r14,r15 + mov rax,QWORD[rbp] + mov r15,rdx + adc r15,0 + + dec ecx + jnz NEAR $L$8x_tail + + lea rbp,[64+rbp] + mov rdx,QWORD[((8+8))+rsp] + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$8x_tail_done + + mov rbx,QWORD[((48+56+8))+rsp] + neg rsi + mov rax,QWORD[rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + sbb rsi,rsi + + mov ecx,8 + jmp NEAR $L$8x_tail + +ALIGN 32 +$L$8x_tail_done: + xor rax,rax + add r8,QWORD[rdx] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + neg rsi +$L$8x_no_tail: + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + mov rcx,QWORD[((-8))+rbp] + xor rsi,rsi + +DB 102,72,15,126,213 + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 +DB 102,73,15,126,217 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + lea rdi,[64+rdi] + + cmp rdi,rdx + jb NEAR $L$8x_reduction_loop + ret + + + +ALIGN 32 +__bn_post4x_internal: + + mov r12,QWORD[rbp] + lea rbx,[r9*1+rdi] + mov rcx,r9 +DB 102,72,15,126,207 + neg rax +DB 102,72,15,126,206 + sar rcx,3+2 + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + +ALIGN 16 +$L$sqr4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqr4x_sub_entry: + lea rbp,[32+rbp] + not r12 + not r13 + not r14 + not r15 + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + neg r10 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + adc r14,QWORD[16+rbx] + adc r15,QWORD[24+rbx] + mov QWORD[rdi],r12 + lea rbx,[32+rbx] + mov 
QWORD[8+rdi],r13 + sbb r10,r10 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + + inc rcx + jnz NEAR $L$sqr4x_sub + + mov r10,r9 + neg r9 + ret + + +global bn_mulx4x_mont_gather5 + +ALIGN 32 +bn_mulx4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mulx4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mulx4xsp_done + +$L$mulx4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mulx4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + + + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$mulx4x_body: + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mulx4x_mont_gather5: + + +ALIGN 32 +mulx4x_internal: + + mov QWORD[8+rsp],r9 + mov r10,r9 + neg r9 + shl r9,5 + neg r10 + lea r13,[128+r9*1+rdx] + shr r9,5+5 + movd xmm5,DWORD[56+rax] + sub r9,1 + lea rax,[$L$inc] + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((24+8))+rsp],r9 + mov QWORD[((56+8))+rsp],rdi + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r10*1+rsp] + lea rdi,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + DB 0x67 + movdqa xmm2,xmm1 + DB 0x67 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + DB 0x67 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa 
XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + + pand xmm0,XMMWORD[64+rdi] + pand xmm1,XMMWORD[80+rdi] + pand xmm2,XMMWORD[96+rdi] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+rdi] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+rdi] + movdqa xmm5,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+rdi] + movdqa xmm5,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[rdi] + movdqa xmm5,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + pxor xmm0,xmm1 + + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + lea rbx,[((64+32+8))+rsp] + + mov r9,rdx + mulx rax,r8,QWORD[rsi] + mulx r12,r11,QWORD[8+rsi] + add r11,rax + mulx r13,rax,QWORD[16+rsi] + adc r12,rax + adc r13,0 + mulx r14,rax,QWORD[24+rsi] + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + xor rbp,rbp + mov rdx,r8 + + mov QWORD[((8+8))+rsp],rdi + + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] + DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[8+rsp] + adc r15,rbp + lea rsi,[rax*1+rsi] + add r14,r15 + mov rdi,QWORD[((8+8))+rsp] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + lea r10,[((16-256))+rbx] + pxor xmm4,xmm4 + DB 0x67,0x67 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+rdi] + movdqa xmm1,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm0,XMMWORD[256+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm1,XMMWORD[272+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[288+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[304+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+rdi] + movdqa xmm1,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm0,XMMWORD[320+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + 
pand xmm1,XMMWORD[336+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[352+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[368+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[rdi] + movdqa xmm1,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm0,XMMWORD[384+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm1,XMMWORD[400+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[416+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[432+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+rdi] + movdqa xmm1,XMMWORD[80+rdi] + movdqa xmm2,XMMWORD[96+rdi] + pand xmm0,XMMWORD[448+r10] + movdqa xmm3,XMMWORD[112+rdi] + pand xmm1,XMMWORD[464+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[480+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[496+r10] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + + mov QWORD[rbx],rbp + lea rbx,[32+rax*1+rbx] + mulx r11,r8,QWORD[rsi] + xor rbp,rbp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + mulx r14,rdx,QWORD[24+rsi] + adox r12,QWORD[((-16))+rbx] + adcx r13,rdx + lea rcx,[rax*1+rcx] + lea rsi,[32+rsi] + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + adox r14,rbp + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + + mov rdx,r8 + xor rbp,rbp + mov QWORD[((8+8))+rsp],rdi + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r12,rax + mov QWORD[((-24))+rbx],r11 + adox r15,rbp + mov QWORD[((-16))+rbx],r12 + lea rcx,[32+rcx] + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mov QWORD[((-32))+rbx],r11 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + lea rcx,[32+rcx] + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[((0+8))+rsp] + adc r15,rbp + sub rdi,QWORD[rbx] + mov rdi,QWORD[((8+8))+rsp] + mov r10,QWORD[((16+8))+rsp] + adc r14,r15 + lea rsi,[rax*1+rsi] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + + cmp rdi,r10 + jb NEAR $L$mulx4x_outer + + mov r10,QWORD[((-8))+rcx] + mov r8,rbp + mov r12,QWORD[rax*1+rcx] + lea rbp,[rax*1+rcx] + mov rcx,rax + lea rdi,[rax*1+rbx] + xor eax,eax + xor r15,r15 + sub r10,r14 + adc r15,r15 + or r8,r15 + sar rcx,3+2 + sub rax,r8 + mov rdx,QWORD[((56+8))+rsp] + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + + +global bn_powerx5 + +ALIGN 32 +bn_powerx5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_powerx5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 
+ + push r13 + + push r14 + + push r15 + +$L$powerx5_prologue: + + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwrx_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwrx_sp_done + +ALIGN 32 +$L$pwrx_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwrx_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk + jmp NEAR $L$pwrx_page_walk_done + +$L$pwrx_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk +$L$pwrx_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + + + pxor xmm0,xmm0 +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 102,73,15,110,218 +DB 102,72,15,110,226 + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + mov r9,r10 + mov rdi,rsi +DB 102,72,15,126,209 +DB 102,72,15,126,226 + mov rax,QWORD[40+rsp] + + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$powerx5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_powerx5: + +global bn_sqrx8x_internal + + +ALIGN 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rdi,[((48+8))+rsp] + lea rbp,[r9*1+rsi] + mov QWORD[((0+8))+rsp],r9 + mov QWORD[((8+8))+rsp],rbp + jmp NEAR $L$sqr8x_zero_start + +ALIGN 32 + DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +$L$sqrx8x_zero: + DB 0x3e + movdqa XMMWORD[rdi],xmm0 + movdqa XMMWORD[16+rdi],xmm0 + movdqa XMMWORD[32+rdi],xmm0 + movdqa XMMWORD[48+rdi],xmm0 +$L$sqr8x_zero_start: + movdqa XMMWORD[64+rdi],xmm0 + movdqa XMMWORD[80+rdi],xmm0 + movdqa XMMWORD[96+rdi],xmm0 + movdqa XMMWORD[112+rdi],xmm0 + lea rdi,[128+rdi] + sub r9,64 + jnz NEAR $L$sqrx8x_zero + + mov rdx,QWORD[rsi] + + xor r10,r10 + xor r11,r11 + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + lea rdi,[((48+8))+rsp] + xor rbp,rbp + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_loop: + mulx rax,r8,QWORD[8+rsi] + adcx r8,r9 + adox r10,rax + mulx rax,r9,QWORD[16+rsi] + adcx r9,r10 + adox r11,rax + DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcx r10,r11 + adox r12,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcx r11,r12 + adox r13,rax + mulx rax,r12,QWORD[40+rsi] + adcx r12,r13 + adox r14,rax + mulx rax,r13,QWORD[48+rsi] + adcx r13,r14 + adox rax,r15 + mulx r15,r14,QWORD[56+rsi] + mov rdx,QWORD[8+rsi] + adcx r14,rax + adox r15,rbp + adc r15,QWORD[64+rdi] + mov QWORD[8+rdi],r8 + mov QWORD[16+rdi],r9 + sbb rcx,rcx + xor rbp,rbp + + + mulx rbx,r8,QWORD[16+rsi] + mulx rax,r9,QWORD[24+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[32+rsi] + adcx r9,r11 + adox r10,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcx r10,r12 + adox r11,rbx + DB 
0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcx r11,r13 + adox r12,r14 + DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + mov rdx,QWORD[16+rsi] + adcx r12,rax + adox r13,rbx + adcx r13,r15 + adox r14,rbp + adcx r14,rbp + + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + + mulx rbx,r8,QWORD[24+rsi] + mulx rax,r9,QWORD[32+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[40+rsi] + adcx r9,r11 + adox r10,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcx r10,r12 + adox r11,r13 + DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 + DB 0x3e + mov rdx,QWORD[24+rsi] + adcx r11,rbx + adox r12,rax + adcx r12,r14 + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mulx rax,r8,QWORD[32+rsi] + adox r13,rbp + adcx r13,rbp + + mulx rbx,r9,QWORD[40+rsi] + adcx r8,r10 + adox r9,rax + mulx rax,r10,QWORD[48+rsi] + adcx r9,r11 + adox r10,r12 + mulx r12,r11,QWORD[56+rsi] + mov rdx,QWORD[32+rsi] + mov r14,QWORD[40+rsi] + adcx r10,rbx + adox r11,rax + mov r15,QWORD[48+rsi] + adcx r11,r13 + adox r12,rbp + adcx r12,rbp + + mov QWORD[56+rdi],r8 + mov QWORD[64+rdi],r9 + + mulx rax,r9,r14 + mov r8,QWORD[56+rsi] + adcx r9,r10 + mulx rbx,r10,r15 + adox r10,rax + adcx r10,r11 + mulx rax,r11,r8 + mov rdx,r14 + adox r11,rbx + adcx r11,r12 + + adcx rax,rbp + + mulx rbx,r14,r15 + mulx r13,r12,r8 + mov rdx,r15 + lea rsi,[64+rsi] + adcx r11,r14 + adox r12,rbx + adcx r12,rax + adox r13,rbp + + DB 0x67,0x67 + mulx r14,r8,r8 + adcx r13,r8 + adcx r14,rbp + + cmp rsi,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_outer_break + + neg rcx + mov rcx,-8 + mov r15,rbp + mov r8,QWORD[64+rdi] + adcx r9,QWORD[72+rdi] + adcx r10,QWORD[80+rdi] + adcx r11,QWORD[88+rdi] + adc r12,QWORD[96+rdi] + adc r13,QWORD[104+rdi] + adc r14,QWORD[112+rdi] + adc r15,QWORD[120+rdi] + lea rbp,[rsi] + lea rdi,[128+rdi] + sbb rax,rax + + mov rdx,QWORD[((-64))+rsi] + mov QWORD[((16+8))+rsp],rax + mov QWORD[((24+8))+rsp],rdi + + + xor eax,eax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_loop: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + + DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + mov QWORD[rcx*8+rdi],rbx + mov ebx,0 + adcx r13,rax + adox r14,r15 + + DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + mov rdx,QWORD[8+rcx*8+rsi] + adcx r14,rax + adox r15,rbx + adcx r15,rbx + + DB 0x67 + inc rcx + jnz NEAR $L$sqrx8x_loop + + lea rbp,[64+rbp] + mov rcx,-8 + cmp rbp,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_break + + sub rbx,QWORD[((16+8))+rsp] + DB 0x66 + mov rdx,QWORD[((-64))+rsi] + adcx r8,QWORD[rdi] + adcx r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + DB 0x67 + sbb rax,rax + xor ebx,ebx + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_break: + xor rbp,rbp + sub rbx,QWORD[((16+8))+rsp] + adcx r8,rbp + mov rcx,QWORD[((24+8))+rsp] + adcx r9,rbp + mov rdx,QWORD[rsi] + adc r10,0 + mov QWORD[rdi],r8 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + cmp rdi,rcx + je NEAR $L$sqrx8x_outer_loop + + mov QWORD[8+rdi],r9 + mov r9,QWORD[8+rcx] + mov QWORD[16+rdi],r10 + mov r10,QWORD[16+rcx] + mov QWORD[24+rdi],r11 + mov r11,QWORD[24+rcx] + mov QWORD[32+rdi],r12 + mov 
r12,QWORD[32+rcx] + mov QWORD[40+rdi],r13 + mov r13,QWORD[40+rcx] + mov QWORD[48+rdi],r14 + mov r14,QWORD[48+rcx] + mov QWORD[56+rdi],r15 + mov r15,QWORD[56+rcx] + mov rdi,rcx + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_break: + mov QWORD[72+rdi],r9 +DB 102,72,15,126,217 + mov QWORD[80+rdi],r10 + mov QWORD[88+rdi],r11 + mov QWORD[96+rdi],r12 + mov QWORD[104+rdi],r13 + mov QWORD[112+rdi],r14 + lea rdi,[((48+8))+rsp] + mov rdx,QWORD[rcx*1+rsi] + + mov r11,QWORD[8+rdi] + xor r10,r10 + mov r9,QWORD[((0+8))+rsp] + adox r11,r11 + mov r12,QWORD[16+rdi] + mov r13,QWORD[24+rdi] + + +ALIGN 32 +$L$sqrx4x_shift_n_add: + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 + DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[40+rdi] + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + mov rdx,QWORD[16+rcx*1+rsi] + mov r12,QWORD[48+rdi] + adox r11,r11 + adcx rbx,r13 + mov r13,QWORD[56+rdi] + mov QWORD[16+rdi],rax + mov QWORD[24+rdi],rbx + + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + mov rdx,QWORD[24+rcx*1+rsi] + lea rcx,[32+rcx] + mov r10,QWORD[64+rdi] + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[72+rdi] + mov QWORD[32+rdi],rax + mov QWORD[40+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + jrcxz $L$sqrx4x_shift_n_add_break + DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adox r11,r11 + adcx rbx,r13 + mov r12,QWORD[80+rdi] + mov r13,QWORD[88+rdi] + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] + nop + jmp NEAR $L$sqrx4x_shift_n_add + +ALIGN 32 +$L$sqrx4x_shift_n_add_break: + adcx rbx,r13 + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] +DB 102,72,15,126,213 +__bn_sqrx8x_reduction: + xor eax,eax + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rcx,[((-64))+r9*1+rbp] + + mov QWORD[((0+8))+rsp],rcx + mov QWORD[((8+8))+rsp],rdi + + lea rdi,[((48+8))+rsp] + jmp NEAR $L$sqrx8x_reduction_loop + +ALIGN 32 +$L$sqrx8x_reduction_loop: + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r8,rdx + imul rdx,rbx + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[((24+8))+rsp],rax + + lea rdi,[64+rdi] + xor rsi,rsi + mov rcx,-8 + jmp NEAR $L$sqrx8x_reduce + +ALIGN 32 +$L$sqrx8x_reduce: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rax,rbx + adox r8,r9 + + mulx r9,rbx,QWORD[8+rbp] + adcx r8,rbx + adox r9,r10 + + mulx r10,rbx,QWORD[16+rbp] + adcx r9,rbx + adox r10,r11 + + mulx r11,rbx,QWORD[24+rbp] + adcx r10,rbx + adox r11,r12 + + DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + mov rax,rdx + mov rdx,r8 + adcx r11,rbx + adox r12,r13 + + mulx rdx,rbx,QWORD[((32+8))+rsp] + mov rdx,rax + mov QWORD[((64+48+8))+rcx*8+rsp],rax + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,rbx + adcx r14,rax + adox r15,rsi + adcx r15,rsi + + DB 0x67,0x67,0x67 + inc rcx + jnz NEAR $L$sqrx8x_reduce + + mov rax,rsi + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_no_tail + + mov rdx,QWORD[((48+8))+rsp] + add r8,QWORD[rdi] + lea rbp,[64+rbp] + mov rcx,-8 + adcx r9,QWORD[8+rdi] + adcx r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail: + 
mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + + DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,QWORD[((72+48+8))+rcx*8+rsp] + adcx r14,rax + adox r15,rsi + mov QWORD[rcx*8+rdi],rbx + mov rbx,r8 + adcx r15,rsi + + inc rcx + jnz NEAR $L$sqrx8x_tail + + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_tail_done + + sub rsi,QWORD[((16+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rbp,[64+rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + sub rcx,8 + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail_done: + xor rax,rax + add r8,QWORD[((24+8))+rsp] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + sub rsi,QWORD[((16+8))+rsp] +$L$sqrx8x_no_tail: + adc r8,QWORD[rdi] +DB 102,72,15,126,217 + adc r9,QWORD[8+rdi] + mov rsi,QWORD[56+rbp] +DB 102,72,15,126,213 + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[64+rcx*1+rdi] + + mov QWORD[rdi],r8 + lea r8,[64+rdi] + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + lea rdi,[64+rcx*1+rdi] + cmp r8,QWORD[((8+8))+rsp] + jb NEAR $L$sqrx8x_reduction_loop + ret + + +ALIGN 32 + +__bn_postx4x_internal: + + mov r12,QWORD[rbp] + mov r10,rcx + mov r9,rcx + neg rax + sar rcx,3+2 + +DB 102,72,15,126,202 +DB 102,72,15,126,206 + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + +ALIGN 16 +$L$sqrx4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqrx4x_sub_entry: + andn r12,r12,rax + lea rbp,[32+rbp] + andn r13,r13,rax + andn r14,r14,rax + andn r15,r15,rax + + neg r8 + adc r12,QWORD[rdi] + adc r13,QWORD[8+rdi] + adc r14,QWORD[16+rdi] + adc r15,QWORD[24+rdi] + mov QWORD[rdx],r12 + lea rdi,[32+rdi] + mov QWORD[8+rdx],r13 + sbb r8,r8 + mov QWORD[16+rdx],r14 + mov QWORD[24+rdx],r15 + lea rdx,[32+rdx] + + inc rcx + jnz NEAR $L$sqrx4x_sub + + neg r9 + + ret + + +global bn_scatter5 + +ALIGN 16 +bn_scatter5: + +_CET_ENDBR + cmp edx,0 + jz NEAR $L$scatter_epilogue + + + + + + + + + + lea r8,[r9*8+r8] +$L$scatter: + mov rax,QWORD[rcx] + lea rcx,[8+rcx] + mov QWORD[r8],rax + lea r8,[256+r8] + sub edx,1 + jnz NEAR $L$scatter +$L$scatter_epilogue: + ret + + + +global bn_gather5 + +ALIGN 32 +bn_gather5: + +$L$SEH_begin_bn_gather5: +_CET_ENDBR + + DB 0x4c,0x8d,0x14,0x24 + + DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + lea rax,[$L$inc] + and rsp,-16 + + movd xmm5,r9d + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r11,[128+r8] + lea rax,[128+rsp] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-128)+rax],xmm0 
+ movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-112)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-96)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-80)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-64)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-48)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-32)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-16)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[16+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[32+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[48+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[64+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[80+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[96+rax],xmm2 + movdqa xmm2,xmm4 + movdqa XMMWORD[112+rax],xmm3 + jmp NEAR $L$gather + +ALIGN 32 +$L$gather: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r11] + movdqa xmm1,XMMWORD[((-112))+r11] + movdqa xmm2,XMMWORD[((-96))+r11] + pand xmm0,XMMWORD[((-128))+rax] + movdqa xmm3,XMMWORD[((-80))+r11] + pand xmm1,XMMWORD[((-112))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r11] + movdqa xmm1,XMMWORD[((-48))+r11] + movdqa xmm2,XMMWORD[((-32))+r11] + pand xmm0,XMMWORD[((-64))+rax] + movdqa xmm3,XMMWORD[((-16))+r11] + pand xmm1,XMMWORD[((-48))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r11] + movdqa xmm1,XMMWORD[16+r11] + movdqa xmm2,XMMWORD[32+r11] + pand xmm0,XMMWORD[rax] + movdqa xmm3,XMMWORD[48+r11] + pand xmm1,XMMWORD[16+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r11] + movdqa xmm1,XMMWORD[80+r11] + movdqa xmm2,XMMWORD[96+r11] + pand xmm0,XMMWORD[64+rax] + movdqa xmm3,XMMWORD[112+r11] + pand xmm1,XMMWORD[80+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rax] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + lea r11,[256+r11] + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + movq QWORD[rcx],xmm0 + lea rcx,[8+rcx] + sub edx,1 + jnz NEAR $L$gather + + lea rsp,[r10] + + ret +$L$SEH_end_bn_gather5: + + +section .rdata rdata align=8 +ALIGN 64 +$L$inc: + DD 0,0,1,1 + DD 2,2,2,2 + DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 + DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 + DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 + DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79 + DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111 + DB 112,101,110,115,115,108,46,111,114,103,62,0 +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +mul_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov 
r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea r10,[$L$mul4x_epilogue] + cmp rbx,r10 + ja NEAR $L$body_40 + + mov r10,QWORD[192+r8] + mov rax,QWORD[8+r10*8+rax] + + jmp NEAR $L$common_pop_regs + +$L$body_40: + mov rax,QWORD[40+rax] +$L$common_pop_regs: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_power5_nohw wrt ..imagebase + DD $L$SEH_end_bn_power5_nohw wrt ..imagebase + DD $L$SEH_info_bn_power5_nohw wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_powerx5 wrt ..imagebase + DD $L$SEH_end_bn_powerx5 wrt ..imagebase + DD $L$SEH_info_bn_powerx5 wrt ..imagebase + DD $L$SEH_begin_bn_gather5 wrt ..imagebase + DD $L$SEH_end_bn_gather5 wrt ..imagebase + DD $L$SEH_info_bn_gather5 wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_bn_mul4x_mont_gather5: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_power5_nohw: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mulx4x_mont_gather5: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_powerx5: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_gather5: + DB 0x01,0x0b,0x03,0x0a + DB 0x0b,0x01,0x21,0x00 + DB 0x04,0xa3,0x00,0x00 +ALIGN 8 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/ring-0.17.14/pregenerated/x86_64-mont5-nasm.o b/ring-0.17.14/pregenerated/x86_64-mont5-nasm.o new file mode 100644 index 0000000000..2d5a63e2f1 Binary files /dev/null and b/ring-0.17.14/pregenerated/x86_64-mont5-nasm.o differ diff --git a/ring-0.17.14/src/aead.rs 
b/ring-0.17.14/src/aead.rs new file mode 100644 index 0000000000..16a8aba4b6 --- /dev/null +++ b/ring-0.17.14/src/aead.rs @@ -0,0 +1,186 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Authenticated Encryption with Associated Data (AEAD). +//! +//! See [Authenticated encryption: relations among notions and analysis of the +//! generic composition paradigm][AEAD] for an introduction to the concept of +//! AEADs. +//! +//! [AEAD]: https://eprint.iacr.org/2000/025.pdf +//! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD + +use crate::{ + cpu, error, + polyfill::{u64_from_usize, usize_from_u64_saturated}, +}; + +pub use self::{ + algorithm::{Algorithm, AES_128_GCM, AES_256_GCM, CHACHA20_POLY1305}, + less_safe_key::LessSafeKey, + nonce::{Nonce, NONCE_LEN}, + opening_key::OpeningKey, + sealing_key::SealingKey, + unbound_key::UnboundKey, +}; + +/// A sequences of unique nonces. +/// +/// A given `NonceSequence` must never return the same `Nonce` twice from +/// `advance()`. +/// +/// A simple counter is a reasonable (but probably not ideal) `NonceSequence`. +/// +/// Intentionally not `Clone` or `Copy` since cloning would allow duplication +/// of the sequence. +pub trait NonceSequence { + /// Returns the next nonce in the sequence. + /// + /// This may fail if "too many" nonces have been requested, where how many + /// is too many is up to the implementation of `NonceSequence`. An + /// implementation may that enforce a maximum number of records are + /// sent/received under a key this way. Once `advance()` fails, it must + /// fail for all subsequent calls. + fn advance(&mut self) -> Result; +} + +/// An AEAD key bound to a nonce sequence. +pub trait BoundKey: core::fmt::Debug { + /// Constructs a new key from the given `UnboundKey` and `NonceSequence`. + fn new(key: UnboundKey, nonce_sequence: N) -> Self; + + /// The key's AEAD algorithm. + fn algorithm(&self) -> &'static Algorithm; +} + +/// The additionally authenticated data (AAD) for an opening or sealing +/// operation. This data is authenticated but is **not** encrypted. +/// +/// The type `A` could be a byte slice `&[u8]`, a byte array `[u8; N]` +/// for some constant `N`, `Vec`, etc. +#[derive(Clone, Copy)] +pub struct Aad(A); + +impl> Aad { + /// Construct the `Aad` from the given bytes. + #[inline] + pub fn from(aad: A) -> Self { + Self(aad) + } +} + +impl AsRef<[u8]> for Aad +where + A: AsRef<[u8]>, +{ + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +impl Aad<[u8; 0]> { + /// Construct an empty `Aad`. 
+ pub fn empty() -> Self { + Self::from([]) + } +} + +impl core::fmt::Debug for Aad +where + A: core::fmt::Debug, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_tuple("Aad").field(&self.0).finish() + } +} + +impl PartialEq for Aad +where + A: PartialEq, +{ + #[inline] + fn eq(&self, other: &Self) -> bool { + self.0.eq(&other.0) + } +} + +impl Eq for Aad where A: Eq {} + +#[allow(clippy::large_enum_variant, variant_size_differences)] +#[derive(Clone)] +enum KeyInner { + AesGcm(aes_gcm::Key), + ChaCha20Poly1305(chacha20_poly1305::Key), +} + +const fn max_input_len(block_len: usize, overhead_blocks_per_nonce: usize) -> usize { + // Each of our AEADs use a 32-bit block counter so the maximum is the + // largest input that will not overflow the counter. + usize_from_u64_saturated( + ((1u64 << 32) - u64_from_usize(overhead_blocks_per_nonce)) * u64_from_usize(block_len), + ) +} + +/// A possibly valid authentication tag. +#[must_use] +#[repr(C)] +#[derive(Clone, Copy)] +pub struct Tag([u8; TAG_LEN]); + +impl AsRef<[u8]> for Tag { + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +impl TryFrom<&[u8]> for Tag { + type Error = error::Unspecified; + + fn try_from(value: &[u8]) -> Result { + let raw_tag: [u8; TAG_LEN] = value.try_into().map_err(|_| error::Unspecified)?; + Ok(Self::from(raw_tag)) + } +} + +impl From<[u8; TAG_LEN]> for Tag { + #[inline] + fn from(value: [u8; TAG_LEN]) -> Self { + Self(value) + } +} + +const MAX_KEY_LEN: usize = 32; + +// All the AEADs we support use 128-bit tags. +const TAG_LEN: usize = 16; + +/// The maximum length of a tag for the algorithms in this module. +pub const MAX_TAG_LEN: usize = TAG_LEN; + +mod aes; +mod aes_gcm; +mod algorithm; +mod chacha; +mod chacha20_poly1305; +pub mod chacha20_poly1305_openssh; +mod gcm; +mod less_safe_key; +mod nonce; +mod opening_key; +mod overlapping; +mod poly1305; +pub mod quic; +mod sealing_key; +mod shift; +mod unbound_key; diff --git a/ring-0.17.14/src/aead/aes.rs b/ring-0.17.14/src/aead/aes.rs new file mode 100644 index 0000000000..d9832aeb21 --- /dev/null +++ b/ring-0.17.14/src/aead/aes.rs @@ -0,0 +1,284 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{nonce::Nonce, overlapping, quic::Sample, NONCE_LEN}; +use crate::{ + bb, + cpu::{self, GetFeature as _}, + error, + polyfill::unwrap_const, +}; +use cfg_if::cfg_if; +use core::num::NonZeroU32; + +pub(super) use ffi::Counter; + +#[macro_use] +mod ffi; + +mod bs; +pub(super) mod fallback; +pub(super) mod hw; +pub(super) mod vp; + +pub type Overlapping<'o> = overlapping::Overlapping<'o, u8>; +pub type OverlappingPartialBlock<'o> = overlapping::PartialBlock<'o, u8, BLOCK_LEN>; + +cfg_if! 
{ + if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64"))] { + pub(super) use ffi::AES_KEY; + } else { + use ffi::AES_KEY; + } +} + +#[derive(Clone)] +pub(super) enum Key { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + Hw(hw::Key), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] + Vp(vp::Key), + + Fallback(fallback::Key), +} + +impl Key { + #[inline] + pub fn new( + bytes: KeyBytes<'_>, + cpu_features: cpu::Features, + ) -> Result { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] + if let Some(hw_features) = cpu_features.get_feature() { + return Ok(Self::Hw(hw::Key::new( + bytes, + hw_features, + cpu_features.get_feature(), + )?)); + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + if let Some(vp_features) = cpu_features.get_feature() { + return Ok(Self::Vp(vp::Key::new(bytes, vp_features)?)); + } + + let _ = cpu_features; + + Ok(Self::Fallback(fallback::Key::new(bytes)?)) + } + + #[inline] + fn encrypt_block(&self, a: Block) -> Block { + match self { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + Key::Hw(inner) => inner.encrypt_block(a), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] + Key::Vp(inner) => inner.encrypt_block(a), + + Key::Fallback(inner) => inner.encrypt_block(a), + } + } + + pub fn new_mask(&self, sample: Sample) -> [u8; 5] { + let [b0, b1, b2, b3, b4, ..] = self.encrypt_block(sample); + [b0, b1, b2, b3, b4] + } +} + +pub const AES_128_KEY_LEN: usize = 128 / 8; +pub const AES_256_KEY_LEN: usize = 256 / 8; + +pub enum KeyBytes<'a> { + AES_128(&'a [u8; AES_128_KEY_LEN]), + AES_256(&'a [u8; AES_256_KEY_LEN]), +} + +// `Counter` is `ffi::Counter` as its representation is dictated by its use in +// the FFI. +impl Counter { + pub fn one(nonce: Nonce) -> Self { + let mut value = [0u8; BLOCK_LEN]; + value[..NONCE_LEN].copy_from_slice(nonce.as_ref()); + value[BLOCK_LEN - 1] = 1; + Self(value) + } + + pub fn increment(&mut self) -> Iv { + const ONE: NonZeroU32 = unwrap_const(NonZeroU32::new(1)); + + let iv = Iv(self.0); + self.increment_by_less_safe(ONE); + iv + } + + pub(super) fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) { + let [.., c0, c1, c2, c3] = &mut self.0; + let old_value: u32 = u32::from_be_bytes([*c0, *c1, *c2, *c3]); + let new_value = old_value.wrapping_add(increment_by.get()); + [*c0, *c1, *c2, *c3] = u32::to_be_bytes(new_value); + } +} + +/// The IV for a single block encryption. +/// +/// Intentionally not `Clone` to ensure each is used only once. 
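// Illustrative note (not part of the vendored ring sources): the counter
// block built by `Counter::one` above is the 96-bit nonce followed by a
// 32-bit big-endian block counter in the last four bytes, started at 1.
// For example, with nonce bytes [1; 12] the block is
//
//     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1]
//
// In the AES-GCM code later in this diff, the first `increment()` reserves
// that value-1 block as the tag IV and leaves the counter at 2 for the
// payload blocks.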
+pub struct Iv(Block); + +impl From for Iv { + fn from(counter: Counter) -> Self { + Self(counter.0) + } +} + +pub(super) type Block = [u8; BLOCK_LEN]; +pub(super) const BLOCK_LEN: usize = 16; +pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; + +pub(super) trait EncryptBlock { + fn encrypt_block(&self, block: Block) -> Block; + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block; +} + +pub(super) trait EncryptCtr32 { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter); +} + +#[allow(dead_code)] +fn encrypt_block_using_encrypt_iv_xor_block(key: &impl EncryptBlock, block: Block) -> Block { + key.encrypt_iv_xor_block(Iv(block), ZERO_BLOCK) +} + +fn encrypt_iv_xor_block_using_encrypt_block( + key: &impl EncryptBlock, + iv: Iv, + block: Block, +) -> Block { + let encrypted_iv = key.encrypt_block(iv.0); + bb::xor_16(encrypted_iv, block) +} + +#[allow(dead_code)] +fn encrypt_iv_xor_block_using_ctr32(key: &impl EncryptCtr32, iv: Iv, mut block: Block) -> Block { + let mut ctr = Counter(iv.0); // This is OK because we're only encrypting one block. + key.ctr32_encrypt_within(block.as_mut().into(), &mut ctr); + block +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::testutil as test; + + #[test] + pub fn test_aes() { + test::run(test_vector_file!("aes_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let key = consume_key(test_case, "Key"); + let input = test_case.consume_bytes("Input"); + let block: Block = input.as_slice().try_into()?; + let expected_output = test_case.consume_bytes("Output"); + + let output = key.encrypt_block(block); + assert_eq!(output.as_ref(), &expected_output[..]); + + Ok(()) + }) + } + + fn consume_key(test_case: &mut test::TestCase, name: &str) -> Key { + let key = test_case.consume_bytes(name); + let key = &key[..]; + let key = match key.len() { + 16 => KeyBytes::AES_128(key.try_into().unwrap()), + 32 => KeyBytes::AES_256(key.try_into().unwrap()), + _ => unreachable!(), + }; + Key::new(key, cpu::features()).unwrap() + } +} + +// These AES-GCM-specific tests are here instead of in `aes_gcm` because +// `Counter`'s API isn't visible (enough) to aes_gcm. +#[cfg(test)] +mod aes_gcm_tests { + use super::{super::aes_gcm::MAX_IN_OUT_LEN, *}; + use core::num::NonZeroU32; + + #[test] + fn test_aes_gcm_counter_blocks_max() { + test_aes_gcm_counter_blocks(MAX_IN_OUT_LEN, &[0, 0, 0, 0]); + } + + #[test] + fn test_aes_gcm_counter_blocks_max_minus_one() { + test_aes_gcm_counter_blocks(MAX_IN_OUT_LEN - BLOCK_LEN, &[0xff, 0xff, 0xff, 0xff]); + } + + fn test_aes_gcm_counter_blocks(in_out_len: usize, expected_final_counter: &[u8; 4]) { + fn ctr32(ctr: &Counter) -> &[u8; 4] { + (&ctr.0[12..]).try_into().unwrap() + } + + let rounded_down = in_out_len / BLOCK_LEN; + let blocks = rounded_down + (if in_out_len % BLOCK_LEN == 0 { 0 } else { 1 }); + let blocks = u32::try_from(blocks) + .ok() + .and_then(NonZeroU32::new) + .unwrap(); + + let nonce = Nonce::assume_unique_for_key([1; 12]); + let mut ctr = Counter::one(nonce); + assert_eq!(ctr32(&ctr), &[0, 0, 0, 1]); + let _tag_iv = ctr.increment(); + assert_eq!(ctr32(&ctr), &[0, 0, 0, 2]); + ctr.increment_by_less_safe(blocks); + + // `MAX_IN_OUT_LEN` is less on 32-bit targets, so we don't even get + // close to wrapping, but run the tests on them anyway. 
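// Worked arithmetic behind the expected counters (illustrative, not part of
// the vendored ring sources), following the definitions above:
//
//     MAX_IN_OUT_LEN = max_input_len(16, 2) = (2^32 - 2) * 16 bytes
//                    = 2^36 - 32 = (2^39 - 256) / 8   (the NIST bound)
//
// so the "max" case is exactly 2^32 - 2 whole blocks. The counter starts at
// 1, the tag IV consumes value 1 (leaving 2), and adding 2^32 - 2 blocks
// gives 2 + (2^32 - 2) = 2^32, which wraps to [0, 0, 0, 0]. The "max minus
// one block" case adds 2^32 - 3 instead and ends at 0xffff_ffff.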
+ #[cfg(target_pointer_width = "64")] + assert_eq!(ctr32(&ctr), expected_final_counter); + + #[cfg(target_pointer_width = "32")] + let _ = expected_final_counter; + } +} diff --git a/ring-0.17.14/src/aead/aes/bs.rs b/ring-0.17.14/src/aead/aes/bs.rs new file mode 100644 index 0000000000..2e418bbf10 --- /dev/null +++ b/ring-0.17.14/src/aead/aes/bs.rs @@ -0,0 +1,60 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all(target_arch = "arm", target_endian = "little"))] + +use super::{Counter, Overlapping, AES_KEY}; + +/// SAFETY: +/// * The caller must ensure that if blocks > 0 then either `input` and +/// `output` do not overlap at all, or input == output.add(n) for some +/// (nonnegative) n. +/// * if blocks > 0, The caller must ensure `input` points to `blocks` blocks +/// and that `output` points to writable space for `blocks` blocks. +/// * The caller must ensure that `vpaes_key` was initialized with +/// `vpaes_set_encrypt_key`. +/// * Upon returning, `blocks` blocks will have been read from `input` and +/// written to `output`. +pub(super) unsafe fn ctr32_encrypt_blocks_with_vpaes_key( + in_out: Overlapping<'_>, + vpaes_key: &AES_KEY, + ctr: &mut Counter, +) { + prefixed_extern! { + // bsaes_ctr32_encrypt_blocks requires transformation of an existing + // VPAES key; there is no `bsaes_set_encrypt_key`. + fn vpaes_encrypt_key_to_bsaes(bsaes_key: *mut AES_KEY, vpaes_key: &AES_KEY); + } + + // SAFETY: + // * The caller ensures `vpaes_key` was initialized by + // `vpaes_set_encrypt_key`. + // * `bsaes_key was zeroed above, and `vpaes_encrypt_key_to_bsaes` + // is assumed to initialize `bsaes_key`. + let bsaes_key = unsafe { AES_KEY::derive(vpaes_encrypt_key_to_bsaes, vpaes_key) }; + + // The code for `vpaes_encrypt_key_to_bsaes` notes "vpaes stores one + // fewer round count than bsaes, but the number of keys is the same," + // so use this as a sanity check. + debug_assert_eq!(bsaes_key.rounds(), vpaes_key.rounds() + 1); + + // SAFETY: + // * `bsaes_key` is in bsaes format after calling + // `vpaes_encrypt_key_to_bsaes`. + // * `bsaes_ctr32_encrypt_blocks` satisfies the contract for + // `ctr32_encrypt_blocks`. + unsafe { + ctr32_encrypt_blocks!(bsaes_ctr32_encrypt_blocks, in_out, &bsaes_key, ctr); + } +} diff --git a/ring-0.17.14/src/aead/aes/fallback.rs b/ring-0.17.14/src/aead/aes/fallback.rs new file mode 100644 index 0000000000..40dee0d757 --- /dev/null +++ b/ring-0.17.14/src/aead/aes/fallback.rs @@ -0,0 +1,44 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY}; +use crate::error; + +#[derive(Clone)] +pub struct Key { + inner: AES_KEY, +} + +impl Key { + pub(in super::super) fn new(bytes: KeyBytes<'_>) -> Result { + let inner = unsafe { set_encrypt_key!(aes_nohw_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } +} + +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + unsafe { encrypt_block!(aes_nohw_encrypt, block, &self.inner) } + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_encrypt_block(self, iv, block) + } +} + +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + unsafe { ctr32_encrypt_blocks!(aes_nohw_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } + } +} diff --git a/ring-0.17.14/src/aead/aes/ffi.rs b/ring-0.17.14/src/aead/aes/ffi.rs new file mode 100644 index 0000000000..c2286e1b87 --- /dev/null +++ b/ring-0.17.14/src/aead/aes/ffi.rs @@ -0,0 +1,206 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Block, KeyBytes, Overlapping, BLOCK_LEN}; +use crate::{bits::BitLength, c, error}; +use core::{ + ffi::{c_int, c_uint}, + num::{NonZeroU32, NonZeroUsize}, +}; + +/// nonce || big-endian counter. +#[repr(transparent)] +pub(in super::super) struct Counter(pub(super) [u8; BLOCK_LEN]); + +// Keep this in sync with AES_KEY in aes.h. +#[repr(C)] +#[derive(Clone)] +pub(in super::super) struct AES_KEY { + pub rd_key: [u32; 4 * (MAX_ROUNDS + 1)], + pub rounds: c_uint, +} + +// Keep this in sync with `AES_MAXNR` in aes.h. +const MAX_ROUNDS: usize = 14; + +impl AES_KEY { + #[inline] + pub(super) unsafe fn new( + f: unsafe extern "C" fn(*const u8, BitLength, *mut AES_KEY) -> c_int, + bytes: KeyBytes<'_>, + ) -> Result { + let mut key = Self { + rd_key: [0; 4 * (MAX_ROUNDS + 1)], + rounds: 0, + }; + + let (bytes, key_bits) = match bytes { + KeyBytes::AES_128(bytes) => (&bytes[..], BitLength::from_bits(128)), + KeyBytes::AES_256(bytes) => (&bytes[..], BitLength::from_bits(256)), + }; + + // Unusually, in this case zero means success and non-zero means failure. 
+ if 0 == unsafe { f(bytes.as_ptr(), key_bits, &mut key) } { + debug_assert_ne!(key.rounds, 0); // Sanity check initialization. + Ok(key) + } else { + Err(error::Unspecified) + } + } +} + +#[cfg(all(target_arch = "arm", target_endian = "little"))] +impl AES_KEY { + pub(super) unsafe fn derive( + f: for<'a> unsafe extern "C" fn(*mut AES_KEY, &'a AES_KEY), + src: &Self, + ) -> Self { + let mut r = AES_KEY { + rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], + rounds: 0, + }; + unsafe { f(&mut r, src) }; + r + } + + pub(super) fn rounds(&self) -> u32 { + self.rounds + } +} + +// SAFETY: +// * The function `$name` must read `bits` bits from `user_key`; `bits` will +// always be a valid AES key length, i.e. a whole number of bytes. +// * `$name` must set `key.rounds` to the value expected by the corresponding +// encryption/decryption functions and return 0, or otherwise must return +// non-zero to indicate failure. +// * `$name` may inspect CPU features. +// +// In BoringSSL, the C prototypes for these are in +// crypto/fipsmodule/aes/internal.h. +macro_rules! set_encrypt_key { + ( $name:ident, $key_bytes:expr $(,)? ) => {{ + use crate::bits::BitLength; + use core::ffi::c_int; + prefixed_extern! { + fn $name(user_key: *const u8, bits: BitLength, key: *mut AES_KEY) -> c_int; + } + $crate::aead::aes::ffi::AES_KEY::new($name, $key_bytes) + }}; +} + +macro_rules! encrypt_block { + ($name:ident, $block:expr, $key:expr) => {{ + use crate::aead::aes::{ffi::AES_KEY, Block}; + prefixed_extern! { + fn $name(a: &Block, r: *mut Block, key: &AES_KEY); + } + $key.encrypt_block($name, $block) + }}; +} + +impl AES_KEY { + #[inline] + pub(super) unsafe fn encrypt_block( + &self, + f: unsafe extern "C" fn(&Block, *mut Block, &AES_KEY), + a: Block, + ) -> Block { + let mut result = core::mem::MaybeUninit::uninit(); + unsafe { + f(&a, result.as_mut_ptr(), self); + result.assume_init() + } + } +} + +/// SAFETY: +/// * The caller must ensure that `$key` was initialized with the +/// `set_encrypt_key!` invocation that `$name` requires. +/// * The caller must ensure that fhe function `$name` satisfies the conditions +/// for the `f` parameter to `ctr32_encrypt_blocks`. +macro_rules! ctr32_encrypt_blocks { + ($name:ident, $in_out:expr, $key:expr, $ctr:expr $(,)? ) => {{ + use crate::{ + aead::aes::{ffi::AES_KEY, Counter, BLOCK_LEN}, + c, + }; + prefixed_extern! { + fn $name( + input: *const [u8; BLOCK_LEN], + output: *mut [u8; BLOCK_LEN], + blocks: c::NonZero_size_t, + key: &AES_KEY, + ivec: &Counter, + ); + } + $key.ctr32_encrypt_blocks($name, $in_out, $ctr) + }}; +} + +impl AES_KEY { + /// SAFETY: + /// * `f` must not read more than `blocks` blocks from `input`. + /// * `f` must write exactly `block` blocks to `output`. + /// * In particular, `f` must handle blocks == 0 without reading from `input` + /// or writing to `output`. + /// * `f` must support the input overlapping with the output exactly or + /// with any nonnegative offset `n` (i.e. `input == output.add(n)`); + /// `f` does NOT need to support the cases where input < output. + /// * `key` must have been initialized with the `set_encrypt_key!` invocation + /// that corresponds to `f`. + /// * `f` may inspect CPU features. 
+ #[inline] + pub(super) unsafe fn ctr32_encrypt_blocks( + &self, + f: unsafe extern "C" fn( + input: *const [u8; BLOCK_LEN], + output: *mut [u8; BLOCK_LEN], + blocks: c::NonZero_size_t, + key: &AES_KEY, + ivec: &Counter, + ), + in_out: Overlapping<'_>, + ctr: &mut Counter, + ) { + in_out.with_input_output_len(|input, output, len| { + debug_assert_eq!(len % BLOCK_LEN, 0); + + let blocks = match NonZeroUsize::new(len / BLOCK_LEN) { + Some(blocks) => blocks, + None => { + return; + } + }; + + let input: *const [u8; BLOCK_LEN] = input.cast(); + let output: *mut [u8; BLOCK_LEN] = output.cast(); + let blocks_u32: NonZeroU32 = blocks.try_into().unwrap(); + + // SAFETY: + // * `input` points to `blocks` blocks. + // * `output` points to space for `blocks` blocks to be written. + // * input == output.add(n), where n == src.start, and the caller is + // responsible for ensuing this sufficient for `f` to work correctly. + // * `blocks` is non-zero so `f` doesn't have to work for empty slices. + // * The caller is responsible for ensuring `key` was initialized by the + // `set_encrypt_key!` invocation required by `f`. + unsafe { + f(input, output, blocks, self, ctr); + } + + ctr.increment_by_less_safe(blocks_u32); + }); + } +} diff --git a/ring-0.17.14/src/aead/aes/hw.rs b/ring-0.17.14/src/aead/aes/hw.rs new file mode 100644 index 0000000000..6310f22dab --- /dev/null +++ b/ring-0.17.14/src/aead/aes/hw.rs @@ -0,0 +1,95 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] + +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY}; +use crate::{cpu, error}; +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + pub(in super::super) type RequiredCpuFeatures = cpu::arm::Aes; + pub(in super::super) type OptionalCpuFeatures = (); + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + use cpu::intel::{Aes, Avx, Ssse3}; + // Some functions seem to have been written to require only SSE/SSE2 + // but there seem to be no SSSE3-less CPUs with AES-NI, and we don't + // have feature detection for SSE2. 
+ pub(in super::super) type RequiredCpuFeatures = (Aes, Ssse3); + pub(in super::super) type OptionalCpuFeatures = Avx; + } +} + +#[derive(Clone)] +pub struct Key { + inner: AES_KEY, +} + +impl Key { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + pub(in super::super) fn new( + bytes: KeyBytes<'_>, + _required_cpu_features: RequiredCpuFeatures, + _optional_cpu_features: Option, + ) -> Result { + let inner = unsafe { set_encrypt_key!(aes_hw_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub(in super::super) fn new( + bytes: KeyBytes<'_>, + (Aes { .. }, Ssse3 { .. }): RequiredCpuFeatures, + optional_cpu_features: Option, + ) -> Result { + // Ssse3 is required, but upstream only uses this if there is also Avx; + // presumably the base version is faster on pre-AVX CPUs. + let inner = if let Some(Avx { .. }) = optional_cpu_features { + unsafe { set_encrypt_key!(aes_hw_set_encrypt_key_alt, bytes) }? + } else { + unsafe { set_encrypt_key!(aes_hw_set_encrypt_key_base, bytes) }? + }; + Ok(Self { inner }) + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + #[must_use] + pub(in super::super) fn inner_less_safe(&self) -> &AES_KEY { + &self.inner + } +} + +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + super::encrypt_block_using_encrypt_iv_xor_block(self, block) + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_ctr32(self, iv, block) + } +} + +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + unsafe { ctr32_encrypt_blocks!(aes_hw_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } + } +} diff --git a/ring-0.17.14/src/aead/aes/vp.rs b/ring-0.17.14/src/aead/aes/vp.rs new file mode 100644 index 0000000000..283cd7d14b --- /dev/null +++ b/ring-0.17.14/src/aead/aes/vp.rs @@ -0,0 +1,139 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
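// Illustrative note (not part of the vendored ring sources): the
// `EncryptBlock`/`EncryptCtr32` helpers in aes.rs rely on the usual CTR-mode
// identity, writing `E_k` for raw AES block encryption:
//
//     encrypt_iv_xor_block(iv, block) = E_k(iv) ^ block
//
// so a raw block encryption can be recovered as
//
//     E_k(block) = encrypt_iv_xor_block(block, ZERO_BLOCK)
//
// and a one-block CTR32 encryption with the counter set to `iv` computes the
// same XOR. That is why the `hw::Key` impls just above, and the `vp::Key`
// impls below on most targets, can route `encrypt_block` through
// `encrypt_block_using_encrypt_iv_xor_block` and `encrypt_iv_xor_block`
// through `encrypt_iv_xor_block_using_ctr32`.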
+ +#![cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] + +use super::{Block, Counter, EncryptBlock, EncryptCtr32, Iv, KeyBytes, Overlapping, AES_KEY}; +use crate::{cpu, error}; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") +))] +type RequiredCpuFeatures = cpu::arm::Neon; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(in super::super) type RequiredCpuFeatures = cpu::intel::Ssse3; + +#[derive(Clone)] +pub(in super::super) struct Key { + inner: AES_KEY, +} + +impl Key { + pub(in super::super) fn new( + bytes: KeyBytes<'_>, + _cpu: RequiredCpuFeatures, + ) -> Result { + let inner = unsafe { set_encrypt_key!(vpaes_set_encrypt_key, bytes) }?; + Ok(Self { inner }) + } +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" +))] +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + super::encrypt_block_using_encrypt_iv_xor_block(self, block) + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_ctr32(self, iv, block) + } +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + unsafe { ctr32_encrypt_blocks!(vpaes_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } + } +} + +#[cfg(all(target_arch = "arm", target_endian = "little"))] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + use super::{super::overlapping::IndexError, bs, BLOCK_LEN}; + + let in_out = { + let (in_out, src) = in_out.into_slice_src_mut(); + let blocks = in_out[src.clone()].len() / BLOCK_LEN; + + // bsaes operates in batches of 8 blocks. + let bsaes_blocks = if blocks >= 8 && (blocks % 8) < 6 { + // It's faster to use bsaes for all the full batches and then + // switch to vpaes for the last partial batch (if any). + blocks - (blocks % 8) + } else if blocks >= 8 { + // It's faster to let bsaes handle everything including + // the last partial batch. + blocks + } else { + // It's faster to let vpaes handle everything. + 0 + }; + let bsaes_in_out_len = bsaes_blocks * BLOCK_LEN; + let bs_in_out = + Overlapping::new(&mut in_out[..(src.start + bsaes_in_out_len)], src.clone()) + .unwrap_or_else(|IndexError { .. }| unreachable!()); + + // SAFETY: + // * self.inner was initialized with `vpaes_set_encrypt_key` above, + // as required by `bsaes_ctr32_encrypt_blocks_with_vpaes_key`. + unsafe { + bs::ctr32_encrypt_blocks_with_vpaes_key(bs_in_out, &self.inner, ctr); + } + + Overlapping::new(&mut in_out[bsaes_in_out_len..], src) + .unwrap_or_else(|IndexError { .. }| unreachable!()) + }; + + // SAFETY: + // * self.inner was initialized with `vpaes_set_encrypt_key` above, + // as required by `vpaes_ctr32_encrypt_blocks`. + // * `vpaes_ctr32_encrypt_blocks` satisfies the contract for + // `ctr32_encrypt_blocks`. 
+ unsafe { ctr32_encrypt_blocks!(vpaes_ctr32_encrypt_blocks, in_out, &self.inner, ctr) } + } +} + +#[cfg(target_arch = "x86")] +impl EncryptBlock for Key { + fn encrypt_block(&self, block: Block) -> Block { + unsafe { encrypt_block!(vpaes_encrypt, block, &self.inner) } + } + + fn encrypt_iv_xor_block(&self, iv: Iv, block: Block) -> Block { + super::encrypt_iv_xor_block_using_encrypt_block(self, iv, block) + } +} + +#[cfg(target_arch = "x86")] +impl EncryptCtr32 for Key { + fn ctr32_encrypt_within(&self, in_out: Overlapping<'_>, ctr: &mut Counter) { + super::super::shift::shift_full_blocks(in_out, |input| { + self.encrypt_iv_xor_block(ctr.increment(), *input) + }); + } +} diff --git a/ring-0.17.14/src/aead/aes_gcm.rs b/ring-0.17.14/src/aead/aes_gcm.rs new file mode 100644 index 0000000000..d9e08a3116 --- /dev/null +++ b/ring-0.17.14/src/aead/aes_gcm.rs @@ -0,0 +1,505 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + aes::{self, Counter, Overlapping, OverlappingPartialBlock, BLOCK_LEN, ZERO_BLOCK}, + gcm, + overlapping::IndexError, + Aad, Nonce, Tag, +}; +use crate::{ + cpu, + error::{self, InputTooLongError}, + polyfill::{slice, sliceutil::overwrite_at_start, usize_from_u64_saturated}, +}; +use core::ops::RangeFrom; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +use cpu::GetFeature as _; + +mod aarch64; +mod aeshwclmulmovbe; +mod vaesclmulavx2; + +#[derive(Clone)] +pub(super) struct Key(DynKey); + +impl Key { + pub(super) fn new( + key: aes::KeyBytes, + cpu_features: cpu::Features, + ) -> Result { + Ok(Self(DynKey::new(key, cpu_features)?)) + } +} + +#[derive(Clone)] +enum DynKey { + #[cfg(target_arch = "x86_64")] + VAesClMulAvx2(Combo), + + #[cfg(target_arch = "x86_64")] + AesHwClMulAvxMovbe(Combo), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] + AesHwClMul(Combo), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ))] + Simd(Combo), + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Simd(Combo), + + Fallback(Combo), +} + +impl DynKey { + fn new(key: aes::KeyBytes, cpu: cpu::Features) -> Result { + let cpu = cpu.values(); + + #[cfg(target_arch = "x86_64")] + if let Some((aes, gcm)) = cpu.get_feature() { + // 14.3.1 Detection of VEX-Encoded AES and VPCLMULQDQ + let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let combo = if let Some(cpu) = cpu.get_feature() { + let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu); + Self::VAesClMulAvx2(Combo { aes_key, gcm_key }) + } else if let 
Some(cpu) = cpu.get_feature() { + let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu); + Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) + } else { + let gcm_key = gcm::clmul::Key::new(gcm_key_value, gcm); + Self::AesHwClMul(Combo { aes_key, gcm_key }) + }; + return Ok(combo); + } + + // x86_64 is handled above. + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86" + ))] + if let (Some(aes), Some(gcm)) = (cpu.get_feature(), cpu.get_feature()) { + let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::clmul::Key::new(gcm_key_value, gcm); + return Ok(Self::AesHwClMul(Combo { aes_key, gcm_key })); + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ))] + if let Some(cpu) = cpu.get_feature() { + return Self::new_neon(key, cpu); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + if let Some(cpu) = cpu.get_feature() { + return Self::new_ssse3(key, cpu); + } + + let _ = cpu; + Self::new_fallback(key) + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ))] + #[cfg_attr(target_arch = "aarch64", inline(never))] + fn new_neon(key: aes::KeyBytes, cpu: cpu::arm::Neon) -> Result { + let aes_key = aes::vp::Key::new(key, cpu)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::neon::Key::new(gcm_key_value, cpu); + Ok(Self::Simd(Combo { aes_key, gcm_key })) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[inline(never)] + fn new_ssse3( + key: aes::KeyBytes, + cpu: aes::vp::RequiredCpuFeatures, + ) -> Result { + let aes_key = aes::vp::Key::new(key, cpu)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::fallback::Key::new(gcm_key_value); + Ok(Self::Simd(Combo { aes_key, gcm_key })) + } + + #[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64", + ), + inline(never) + )] + fn new_fallback(key: aes::KeyBytes) -> Result { + let aes_key = aes::fallback::Key::new(key)?; + let gcm_key_value = derive_gcm_key_value(&aes_key); + let gcm_key = gcm::fallback::Key::new(gcm_key_value); + Ok(Self::Fallback(Combo { aes_key, gcm_key })) + } +} + +fn derive_gcm_key_value(aes_key: &impl aes::EncryptBlock) -> gcm::KeyValue { + gcm::KeyValue::new(aes_key.encrypt_block(ZERO_BLOCK)) +} + +const CHUNK_BLOCKS: usize = 3 * 1024 / 16; + +#[inline(never)] +pub(super) fn seal( + Key(key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], +) -> Result { + let mut ctr = Counter::one(nonce); + let tag_iv = ctr.increment(); + + match key { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + DynKey::AesHwClMul(c) => { + seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole) + } + + #[cfg(target_arch = "x86_64")] + DynKey::VAesClMulAvx2(c) => seal_whole_partial( + c, + aad, + in_out, + ctr, + tag_iv, + vaesclmulavx2::seal_whole_vaes_clmul_avx2, + ), + + #[cfg(target_arch = "x86_64")] + DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { + aeshwclmulmovbe::seal(aes_key, gcm_key, ctr, tag_iv, aad, in_out) + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + DynKey::AesHwClMul(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + + #[cfg(any( + all(target_arch = "aarch64", 
target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + DynKey::Simd(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + + DynKey::Fallback(c) => seal_strided(c, aad, in_out, ctr, tag_iv), + } +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +fn seal_whole_partial( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out: &mut [u8], + mut ctr: Counter, + tag_iv: aes::Iv, + seal_whole: impl FnOnce(&A, &mut gcm::Context, &mut Counter, slice::AsChunksMut), +) -> Result { + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + let (whole, remainder) = slice::as_chunks_mut(in_out); + seal_whole(aes_key, &mut auth, &mut ctr, whole); + let remainder = OverlappingPartialBlock::new(remainder.into()) + .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); + seal_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +#[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ), + inline(never) +)] +#[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ), + cold +)] +fn seal_strided< + A: aes::EncryptBlock + aes::EncryptCtr32, + G: gcm::UpdateBlock + gcm::UpdateBlocks, +>( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out: &mut [u8], + mut ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + + let (mut whole, remainder) = slice::as_chunks_mut(in_out); + + for mut chunk in whole.chunks_mut::() { + aes_key.ctr32_encrypt_within(chunk.as_flattened_mut().into(), &mut ctr); + auth.update_blocks(chunk.as_ref()); + } + + let remainder = OverlappingPartialBlock::new(remainder.into()) + .unwrap_or_else(|InputTooLongError { .. 
}| unreachable!()); + seal_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +fn seal_finish( + aes_key: &A, + mut auth: gcm::Context, + remainder: OverlappingPartialBlock<'_>, + ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + let remainder_len = remainder.len(); + if remainder_len > 0 { + let mut input = ZERO_BLOCK; + overwrite_at_start(&mut input, remainder.input()); + let mut output = aes_key.encrypt_iv_xor_block(ctr.into(), input); + output[remainder_len..].fill(0); + auth.update_block(output); + remainder.overwrite_at_start(output); + } + + Ok(finish(aes_key, auth, tag_iv)) +} + +#[inline(never)] +pub(super) fn open( + Key(key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out_slice: &mut [u8], + src: RangeFrom, +) -> Result { + let mut ctr = Counter::one(nonce); + let tag_iv = ctr.increment(); + + match key { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + DynKey::AesHwClMul(c) => { + open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole) + } + + #[cfg(target_arch = "x86_64")] + DynKey::VAesClMulAvx2(c) => open_whole_partial( + c, + aad, + in_out_slice, + src, + ctr, + tag_iv, + vaesclmulavx2::open_whole_vaes_clmul_avx2, + ), + + #[cfg(target_arch = "x86_64")] + DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { + aeshwclmulmovbe::open(aes_key, gcm_key, ctr, tag_iv, aad, in_out_slice, src) + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + DynKey::AesHwClMul(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64", + target_arch = "x86" + ))] + DynKey::Simd(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), + + DynKey::Fallback(c) => open_strided(c, aad, in_out_slice, src, ctr, tag_iv), + } +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +fn open_whole_partial( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out_slice: &mut [u8], + src: RangeFrom, + mut ctr: Counter, + tag_iv: aes::Iv, + open_whole: impl FnOnce(&A, &mut gcm::Context, Overlapping, &mut Counter), +) -> Result { + let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + + let remainder_len = in_out.len() % BLOCK_LEN; + + let in_out_slice_len = in_out_slice.len(); + let whole_in_out_slice = &mut in_out_slice[..(in_out_slice_len - remainder_len)]; + let whole = Overlapping::new(whole_in_out_slice, src.clone()) + .unwrap_or_else(|IndexError { .. }| unreachable!()); + let whole_len = whole.len(); + open_whole(aes_key, &mut auth, whole, &mut ctr); + + let remainder = &mut in_out_slice[whole_len..]; + let remainder = + Overlapping::new(remainder, src).unwrap_or_else(|IndexError { .. }| unreachable!()); + let remainder = OverlappingPartialBlock::new(remainder) + .unwrap_or_else(|InputTooLongError { .. 
}| unreachable!()); + open_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +#[cfg_attr( + any( + all( + any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ), + target_feature = "neon" + ), + all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "sse" + ) + ), + inline(never) +)] +#[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ), + cold +)] +fn open_strided< + A: aes::EncryptBlock + aes::EncryptCtr32, + G: gcm::UpdateBlock + gcm::UpdateBlocks, +>( + Combo { aes_key, gcm_key }: &Combo, + aad: Aad<&[u8]>, + in_out_slice: &mut [u8], + src: RangeFrom, + mut ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; + let input = in_out.input(); + let input_len = input.len(); + + let mut auth = gcm::Context::new(gcm_key, aad, input_len)?; + + let remainder_len = input_len % BLOCK_LEN; + let whole_len = input_len - remainder_len; + let in_prefix_len = src.start; + + { + let mut chunk_len = CHUNK_BLOCKS * BLOCK_LEN; + let mut output = 0; + let mut input = in_prefix_len; + loop { + if whole_len - output < chunk_len { + chunk_len = whole_len - output; + } + + let ciphertext = &in_out_slice[input..][..chunk_len]; + let (ciphertext, leftover) = slice::as_chunks(ciphertext); + debug_assert_eq!(leftover.len(), 0); + if ciphertext.is_empty() { + break; + } + auth.update_blocks(ciphertext); + + let chunk = Overlapping::new( + &mut in_out_slice[output..][..(chunk_len + in_prefix_len)], + in_prefix_len.., + ) + .map_err(error::erase::)?; + aes_key.ctr32_encrypt_within(chunk, &mut ctr); + output += chunk_len; + input += chunk_len; + } + } + + let in_out = Overlapping::new(&mut in_out_slice[whole_len..], src) + .unwrap_or_else(|IndexError { .. }| unreachable!()); + let in_out = OverlappingPartialBlock::new(in_out) + .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); + + open_finish(aes_key, auth, in_out, ctr, tag_iv) +} + +fn open_finish( + aes_key: &A, + mut auth: gcm::Context, + remainder: OverlappingPartialBlock<'_>, + ctr: Counter, + tag_iv: aes::Iv, +) -> Result { + if remainder.len() > 0 { + let mut input = ZERO_BLOCK; + overwrite_at_start(&mut input, remainder.input()); + auth.update_block(input); + remainder.overwrite_at_start(aes_key.encrypt_iv_xor_block(ctr.into(), input)); + } + Ok(finish(aes_key, auth, tag_iv)) +} + +fn finish( + aes_key: &A, + gcm_ctx: gcm::Context, + tag_iv: aes::Iv, +) -> Tag { + // Finalize the tag and return it. + gcm_ctx.pre_finish(|pre_tag| Tag(aes_key.encrypt_iv_xor_block(tag_iv, pre_tag))) +} + +pub(super) const MAX_IN_OUT_LEN: usize = super::max_input_len(BLOCK_LEN, 2); + +// [NIST SP800-38D] Section 5.2.1.1. Note that [RFC 5116 Section 5.1] and +// [RFC 5116 Section 5.2] have an off-by-one error in `P_MAX`. 
+// +// [NIST SP800-38D]: +// http://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf +// [RFC 5116 Section 5.1]: https://tools.ietf.org/html/rfc5116#section-5.1 +// [RFC 5116 Section 5.2]: https://tools.ietf.org/html/rfc5116#section-5.2 +const _MAX_INPUT_LEN_BOUNDED_BY_NIST: () = + assert!(MAX_IN_OUT_LEN == usize_from_u64_saturated(((1u64 << 39) - 256) / 8)); + +#[derive(Copy, Clone)] +pub(super) struct Combo { + pub(super) aes_key: Aes, + pub(super) gcm_key: Gcm, +} diff --git a/ring-0.17.14/src/aead/aes_gcm/aarch64.rs b/ring-0.17.14/src/aead/aes_gcm/aarch64.rs new file mode 100644 index 0000000000..15aa5b134f --- /dev/null +++ b/ring-0.17.14/src/aead/aes_gcm/aarch64.rs @@ -0,0 +1,95 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all(target_arch = "aarch64", target_endian = "little"))] + +use super::{aes, gcm, Counter, BLOCK_LEN}; +use crate::{aead::aes::Overlapping, bits::BitLength, polyfill::slice::AsChunksMut}; +use core::num::NonZeroU64; + +pub(super) fn seal_whole( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + ctr: &mut Counter, + mut in_out: AsChunksMut, +) { + let whole_block_bits = auth.in_out_whole_block_bits(); + let whole_block_bits_u64: BitLength = whole_block_bits.into(); + if let Ok(whole_block_bits) = whole_block_bits_u64.try_into() { + let (htable, xi) = auth.inner(); + + prefixed_extern! { + fn aes_gcm_enc_kernel( + input: *const [u8; BLOCK_LEN], + in_bits: BitLength, + output: *mut [u8; BLOCK_LEN], + Xi: &mut gcm::Xi, + ivec: &mut Counter, + key: &aes::AES_KEY, + Htable: &gcm::HTable); + } + + unsafe { + aes_gcm_enc_kernel( + in_out.as_ptr(), + whole_block_bits, + in_out.as_mut_ptr(), + xi, + ctr, + aes_key.inner_less_safe(), + htable, + ) + } + } +} + +pub(super) fn open_whole( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + in_out: Overlapping, + ctr: &mut Counter, +) { + // Precondition. TODO: Create an overlapping::AsChunks for this. + assert_eq!(in_out.len() % BLOCK_LEN, 0); + + in_out.with_input_output_len(|input, output, _len| { + let whole_block_bits = auth.in_out_whole_block_bits(); + let whole_block_bits_u64: BitLength = whole_block_bits.into(); + if let Ok(whole_block_bits) = whole_block_bits_u64.try_into() { + let (htable, xi) = auth.inner(); + prefixed_extern! 
{ + fn aes_gcm_dec_kernel( + input: *const u8, + in_bits: BitLength, + output: *mut u8, + Xi: &mut gcm::Xi, + ivec: &mut Counter, + key: &aes::AES_KEY, + Htable: &gcm::HTable); + } + + unsafe { + aes_gcm_dec_kernel( + input, + whole_block_bits, + output, + xi, + ctr, + aes_key.inner_less_safe(), + htable, + ) + } + } + }) +} diff --git a/ring-0.17.14/src/aead/aes_gcm/aeshwclmulmovbe.rs b/ring-0.17.14/src/aead/aes_gcm/aeshwclmulmovbe.rs new file mode 100644 index 0000000000..e6d49ee3bb --- /dev/null +++ b/ring-0.17.14/src/aead/aes_gcm/aeshwclmulmovbe.rs @@ -0,0 +1,154 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{ + super::overlapping::IndexError, + aes::{self, Counter, EncryptCtr32, Overlapping, OverlappingPartialBlock}, + gcm, Aad, Tag, +}; +use crate::{ + c, + error::{self, InputTooLongError}, + polyfill::slice, +}; +use core::ops::RangeFrom; + +#[inline(never)] +pub(super) fn seal( + aes_key: &aes::hw::Key, + gcm_key: &gcm::clmulavxmovbe::Key, + mut ctr: Counter, + tag_iv: aes::Iv, + aad: Aad<&[u8]>, + in_out: &mut [u8], +) -> Result { + prefixed_extern! { + // `HTable` and `Xi` should be 128-bit aligned. TODO: Can we shrink `HTable`? The + // assembly says it needs just nine values in that array. + fn aesni_gcm_encrypt( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &mut Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi) -> c::size_t; + } + + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + let (htable, xi) = auth.inner(); + + let processed = unsafe { + aesni_gcm_encrypt( + in_out.as_ptr(), + in_out.as_mut_ptr(), + in_out.len(), + aes_key.inner_less_safe(), + &mut ctr, + htable, + xi, + ) + }; + + let ramaining = match in_out.get_mut(processed..) { + Some(remaining) => remaining, + None => { + // This can't happen. If it did, then the assembly already + // caused a buffer overflow. + unreachable!() + } + }; + let (mut whole, remainder) = slice::as_chunks_mut(ramaining); + aes_key.ctr32_encrypt_within(whole.as_flattened_mut().into(), &mut ctr); + auth.update_blocks(whole.as_ref()); + let remainder = OverlappingPartialBlock::new(remainder.into()) + .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); + + super::seal_finish(aes_key, auth, remainder, ctr, tag_iv) +} + +#[inline(never)] +pub(super) fn open( + aes_key: &aes::hw::Key, + gcm_key: &gcm::clmulavxmovbe::Key, + mut ctr: Counter, + tag_iv: aes::Iv, + aad: Aad<&[u8]>, + in_out_slice: &mut [u8], + src: RangeFrom, +) -> Result { + prefixed_extern! { + // `HTable` and `Xi` should be 128-bit aligned. TODO: Can we shrink `HTable`? The + // assembly says it needs just nine values in that array. 
+ fn aesni_gcm_decrypt( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &mut Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi) -> c::size_t; + } + + let in_out = Overlapping::new(in_out_slice, src.clone()).map_err(error::erase::)?; + let mut auth = gcm::Context::new(gcm_key, aad, in_out.len())?; + let processed = in_out.with_input_output_len(|input, output, len| { + let (htable, xi) = auth.inner(); + unsafe { + aesni_gcm_decrypt( + input, + output, + len, + aes_key.inner_less_safe(), + &mut ctr, + htable, + xi, + ) + } + }); + let in_out_slice = in_out_slice.get_mut(processed..).unwrap_or_else(|| { + // This can't happen. If it did, then the assembly already + // caused a buffer overflow. + unreachable!() + }); + // Authenticate any remaining whole blocks. + let in_out = + Overlapping::new(in_out_slice, src.clone()).unwrap_or_else(|IndexError { .. }| { + // This can't happen. If it did, then the assembly already + // overwrote part of the remaining input. + unreachable!() + }); + let (whole, _) = slice::as_chunks(in_out.input()); + auth.update_blocks(whole); + + let whole_len = whole.as_flattened().len(); + + // Decrypt any remaining whole blocks. + let whole = Overlapping::new(&mut in_out_slice[..(src.start + whole_len)], src.clone()) + .map_err(error::erase::)?; + aes_key.ctr32_encrypt_within(whole, &mut ctr); + + let in_out_slice = match in_out_slice.get_mut(whole_len..) { + Some(partial) => partial, + None => unreachable!(), + }; + let in_out = + Overlapping::new(in_out_slice, src).unwrap_or_else(|IndexError { .. }| unreachable!()); + let in_out = OverlappingPartialBlock::new(in_out) + .unwrap_or_else(|InputTooLongError { .. }| unreachable!()); + + super::open_finish(aes_key, auth, in_out, ctr, tag_iv) +} diff --git a/ring-0.17.14/src/aead/aes_gcm/vaesclmulavx2.rs b/ring-0.17.14/src/aead/aes_gcm/vaesclmulavx2.rs new file mode 100644 index 0000000000..8a2a68f238 --- /dev/null +++ b/ring-0.17.14/src/aead/aes_gcm/vaesclmulavx2.rs @@ -0,0 +1,86 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{aes, gcm, Counter, BLOCK_LEN}; +use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut}; +use core::num::NonZeroU32; + +pub(super) fn seal_whole_vaes_clmul_avx2( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + ctr: &mut Counter, + mut in_out: AsChunksMut, +) { + prefixed_extern! { + fn aes_gcm_enc_update_vaes_avx2( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi); + } + + let in_out = in_out.as_flattened_mut(); + + // Precondition: Since we have a `gcm::Context` then the number of blocks + // must fit in `u32`. 
+ let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); + + if let Some(blocks) = NonZeroU32::new(blocks) { + let aes_key = aes_key.inner_less_safe(); + let (htable, xi) = auth.inner(); + let input = in_out.as_ptr(); + let output = in_out.as_mut_ptr(); + let len = in_out.len(); + unsafe { aes_gcm_enc_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) }; + ctr.increment_by_less_safe(blocks); + } +} + +pub(super) fn open_whole_vaes_clmul_avx2( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + in_out: Overlapping, + ctr: &mut Counter, +) { + prefixed_extern! { + fn aes_gcm_dec_update_vaes_avx2( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &mut Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi); + } + + // Precondition. TODO: Create an overlapping::AsChunks for this. + assert_eq!(in_out.len() % BLOCK_LEN, 0); + // Precondition: Since we have a `gcm::Context` then the number of blocks + // must fit in `u32`. + let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); + + if let Some(blocks) = NonZeroU32::new(blocks) { + let aes_key = aes_key.inner_less_safe(); + let (htable, xi) = auth.inner(); + in_out.with_input_output_len(|input, output, len| unsafe { + aes_gcm_dec_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) + }); + ctr.increment_by_less_safe(blocks); + } +} diff --git a/ring-0.17.14/src/aead/algorithm.rs b/ring-0.17.14/src/aead/algorithm.rs new file mode 100644 index 0000000000..24d4fb89b6 --- /dev/null +++ b/ring-0.17.14/src/aead/algorithm.rs @@ -0,0 +1,269 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + bb, cpu, + error::{self, InputTooLongError}, + hkdf, +}; +use core::ops::RangeFrom; + +use super::{ + aes, aes_gcm, chacha20_poly1305, + nonce::{Nonce, NONCE_LEN}, + overlapping::{IndexError, Overlapping}, + Aad, KeyInner, Tag, TAG_LEN, +}; + +impl hkdf::KeyType for &'static Algorithm { + #[inline] + fn len(&self) -> usize { + self.key_len() + } +} + +/// An AEAD Algorithm. +pub struct Algorithm { + init: fn(key: &[u8], cpu_features: cpu::Features) -> Result, + + seal: fn( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu_features: cpu::Features, + ) -> Result, + open: fn( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + src: RangeFrom, + cpu_features: cpu::Features, + ) -> Result, + + key_len: usize, + id: AlgorithmID, +} + +impl Algorithm { + /// The length of the key. + #[inline(always)] + pub fn key_len(&self) -> usize { + self.key_len + } + + /// The length of a tag. + /// + /// See also `MAX_TAG_LEN`. + #[inline(always)] + pub fn tag_len(&self) -> usize { + TAG_LEN + } + + /// The length of the nonces. 
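// Illustrative values (not part of the vendored ring sources), following the
// constants defined in this module:
//
//     assert_eq!(AES_128_GCM.key_len(), 16);
//     assert_eq!(AES_256_GCM.key_len(), 32);
//     assert_eq!(CHACHA20_POLY1305.key_len(), 32);
//     assert_eq!(AES_128_GCM.tag_len(), 16);    // TAG_LEN / MAX_TAG_LEN
//     assert_eq!(AES_128_GCM.nonce_len(), 12);  // NONCE_LEN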
+ #[inline(always)] + pub fn nonce_len(&self) -> usize { + NONCE_LEN + } + + pub(super) fn new_key( + &self, + key_bytes: &[u8], + cpu_features: cpu::Features, + ) -> Result { + (self.init)(key_bytes, cpu_features) + } + + pub(super) fn open_within<'io>( + &self, + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + received_tag: Tag, + in_out: &'io mut [u8], + src: RangeFrom, + cpu_features: cpu::Features, + ) -> Result<&'io mut [u8], error::Unspecified> { + let ciphertext_len = in_out.get(src.clone()).ok_or(error::Unspecified)?.len(); + + let Tag(calculated_tag) = (self.open)(key, nonce, aad, in_out, src, cpu_features)?; + + if bb::verify_slices_are_equal(calculated_tag.as_ref(), received_tag.as_ref()).is_err() { + // Zero out the plaintext so that it isn't accidentally leaked or used + // after verification fails. It would be safest if we could check the + // tag before decrypting, but some `open` implementations interleave + // authentication with decryption for performance. + for b in &mut in_out[..ciphertext_len] { + *b = 0; + } + return Err(error::Unspecified); + } + + // `ciphertext_len` is also the plaintext length. + Ok(&mut in_out[..ciphertext_len]) + } + + #[inline] + pub(super) fn seal( + &self, + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu_features: cpu::Features, + ) -> Result { + (self.seal)(key, nonce, aad, in_out, cpu_features) + } +} + +derive_debug_via_id!(Algorithm); + +#[derive(Debug, Eq, PartialEq)] +pub(super) enum AlgorithmID { + AES_128_GCM, + AES_256_GCM, + CHACHA20_POLY1305, +} + +impl PartialEq for Algorithm { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for Algorithm {} + +/// AES-128 in GCM mode with 128-bit tags and 96 bit nonces. +pub static AES_128_GCM: Algorithm = Algorithm { + key_len: aes::AES_128_KEY_LEN, + init: aes_gcm_init_128, + seal: aes_gcm_seal, + open: aes_gcm_open, + id: AlgorithmID::AES_128_GCM, +}; + +/// AES-256 in GCM mode with 128-bit tags and 96 bit nonces. +pub static AES_256_GCM: Algorithm = Algorithm { + key_len: aes::AES_256_KEY_LEN, + init: aes_gcm_init_256, + seal: aes_gcm_seal, + open: aes_gcm_open, + id: AlgorithmID::AES_256_GCM, +}; + +fn aes_gcm_init_128( + key: &[u8], + cpu_features: cpu::Features, +) -> Result { + let key = key.try_into().map_err(|_| error::Unspecified)?; + Ok(KeyInner::AesGcm(aes_gcm::Key::new( + aes::KeyBytes::AES_128(key), + cpu_features, + )?)) +} + +fn aes_gcm_init_256( + key: &[u8], + cpu_features: cpu::Features, +) -> Result { + let key = key.try_into().map_err(|_| error::Unspecified)?; + Ok(KeyInner::AesGcm(aes_gcm::Key::new( + aes::KeyBytes::AES_256(key), + cpu_features, + )?)) +} + +fn aes_gcm_seal( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + _cpu_features: cpu::Features, +) -> Result { + let key = match key { + KeyInner::AesGcm(key) => key, + _ => unreachable!(), + }; + aes_gcm::seal(key, nonce, aad, in_out) +} + +pub(super) fn aes_gcm_open( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + src: RangeFrom, + _cpu_features: cpu::Features, +) -> Result { + let key = match key { + KeyInner::AesGcm(key) => key, + _ => unreachable!(), + }; + aes_gcm::open(key, nonce, aad, in_out, src) +} + +/// ChaCha20-Poly1305 as described in [RFC 8439]. +/// +/// The keys are 256 bits long and the nonces are 96 bits long. 
+/// +/// [RFC 8439]: https://tools.ietf.org/html/rfc8439 +pub static CHACHA20_POLY1305: Algorithm = Algorithm { + key_len: chacha20_poly1305::KEY_LEN, + init: chacha20_poly1305_init, + seal: chacha20_poly1305_seal, + open: chacha20_poly1305_open, + id: AlgorithmID::CHACHA20_POLY1305, +}; + +/// Copies |key| into |ctx_buf|. +fn chacha20_poly1305_init( + key: &[u8], + _cpu_features: cpu::Features, +) -> Result { + let key: [u8; chacha20_poly1305::KEY_LEN] = key.try_into()?; + Ok(KeyInner::ChaCha20Poly1305(chacha20_poly1305::Key::new(key))) +} + +fn chacha20_poly1305_seal( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu_features: cpu::Features, +) -> Result { + let key = match key { + KeyInner::ChaCha20Poly1305(key) => key, + _ => unreachable!(), + }; + chacha20_poly1305::seal(key, nonce, aad, in_out, cpu_features) + .map_err(error::erase::) +} + +fn chacha20_poly1305_open( + key: &KeyInner, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + src: RangeFrom, + cpu_features: cpu::Features, +) -> Result { + let key = match key { + KeyInner::ChaCha20Poly1305(key) => key, + _ => unreachable!(), + }; + let in_out = Overlapping::new(in_out, src).map_err(error::erase::)?; + chacha20_poly1305::open(key, nonce, aad, in_out, cpu_features) + .map_err(error::erase::) +} diff --git a/ring-0.17.14/src/aead/chacha.rs b/ring-0.17.14/src/aead/chacha.rs new file mode 100644 index 0000000000..8d7230b58c --- /dev/null +++ b/ring-0.17.14/src/aead/chacha.rs @@ -0,0 +1,327 @@ +// Copyright 2016 Brian Smith. +// Portions Copyright (c) 2016, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{overlapping, quic::Sample, Nonce}; +use crate::cpu; +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + ))] { + #[macro_use] + mod ffi; + #[cfg(any(target_arch = "x86", test))] + mod fallback; + } else { + mod fallback; + } +} + +use crate::polyfill::ArraySplitMap; + +pub type Overlapping<'o> = overlapping::Overlapping<'o, u8>; + +#[derive(Clone)] +pub struct Key { + words: [u32; KEY_LEN / 4], +} + +impl Key { + pub(super) fn new(value: [u8; KEY_LEN]) -> Self { + Self { + words: value.array_split_map(u32::from_le_bytes), + } + } +} + +impl Key { + // Encrypts `in_out` with the counter 0 and returns counter 1, + // where the counter is derived from the nonce `nonce`. 
+ #[inline] + pub(super) fn encrypt_single_block_with_ctr_0( + &self, + nonce: Nonce, + in_out: &mut [u8; N], + cpu: cpu::Features, + ) -> Counter { + assert!(N <= BLOCK_LEN); + let (zero, one) = Counter::zero_one_less_safe(nonce); + self.encrypt(zero, in_out.as_mut().into(), cpu); + one + } + + #[inline] + pub fn new_mask(&self, sample: Sample) -> [u8; 5] { + let cpu = cpu::features(); // TODO: Remove this. + let (ctr, nonce) = sample.split_at(4); + let ctr = u32::from_le_bytes(ctr.try_into().unwrap()); + let nonce = Nonce::assume_unique_for_key(nonce.try_into().unwrap()); + let ctr = Counter::from_nonce_and_ctr(nonce, ctr); + + let mut out: [u8; 5] = [0; 5]; + self.encrypt(ctr, out.as_mut().into(), cpu); + out + } + + #[inline(always)] + pub(super) fn encrypt(&self, counter: Counter, in_out: Overlapping<'_>, cpu: cpu::Features) { + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Neon}; + const NEON_MIN_LEN: usize = 192 + 1; + if in_out.len() >= NEON_MIN_LEN { + if let Some(cpu) = cpu.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (NEON_MIN_LEN, Neon, Overlapping<'_>) => ChaCha20_ctr32_neon }, + self, counter, in_out, cpu); + } + } + if in_out.len() >= 1 { + chacha20_ctr32_ffi!( + unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw }, + self, counter, in_out, ()) + } + } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Neon}; + const NEON_MIN_LEN: usize = 192 + 1; + if in_out.len() >= NEON_MIN_LEN { + if let Some(cpu) = cpu.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (NEON_MIN_LEN, Neon, &mut [u8]) => ChaCha20_ctr32_neon }, + self, counter, in_out.copy_within(), cpu); + } + } + if in_out.len() >= 1 { + chacha20_ctr32_ffi!( + unsafe { (1, (), &mut [u8]) => ChaCha20_ctr32_nohw }, + self, counter, in_out.copy_within(), ()) + } + } else if #[cfg(target_arch = "x86")] { + use cpu::{GetFeature as _, intel::Ssse3}; + if in_out.len() >= 1 { + if let Some(cpu) = cpu.get_feature() { + chacha20_ctr32_ffi!( + unsafe { (1, Ssse3, &mut [u8]) => ChaCha20_ctr32_ssse3 }, + self, counter, in_out.copy_within(), cpu) + } else { + let _: cpu::Features = cpu; + fallback::ChaCha20_ctr32(self, counter, in_out) + } + } + } else if #[cfg(target_arch = "x86_64")] { + use cpu::{GetFeature, intel::{Avx2, Ssse3}}; + const SSE_MIN_LEN: usize = 128 + 1; // Also AVX2, SSSE3_4X, SSSE3 + if in_out.len() >= SSE_MIN_LEN { + let values = cpu.values(); + if let Some(cpu) = values.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (SSE_MIN_LEN, Avx2, Overlapping<'_>) => ChaCha20_ctr32_avx2 }, + self, counter, in_out, cpu); + } + if let Some(cpu) = values.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (SSE_MIN_LEN, Ssse3, Overlapping<'_>) => + ChaCha20_ctr32_ssse3_4x }, + self, counter, in_out, cpu); + } + } + if in_out.len() >= 1 { + chacha20_ctr32_ffi!( + unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw }, + self, counter, in_out, ()) + } + } else { + let _: cpu::Features = cpu; + fallback::ChaCha20_ctr32(self, counter, in_out) + } + } + } + + #[inline] + pub(super) fn words_less_safe(&self) -> &[u32; KEY_LEN / 4] { + &self.words + } +} + +/// Counter || Nonce, all native endian. +#[repr(transparent)] +pub struct Counter([u32; 4]); + +impl Counter { + // Nonce-reuse: the caller must only use the first counter (0) for at most + // a single block. 
+ fn zero_one_less_safe(nonce: Nonce) -> (Self, Self) { + let ctr0 @ Self([_, n0, n1, n2]) = Self::from_nonce_and_ctr(nonce, 0); + let ctr1 = Self([1, n0, n1, n2]); + (ctr0, ctr1) + } + + fn from_nonce_and_ctr(nonce: Nonce, ctr: u32) -> Self { + let [n0, n1, n2] = nonce.as_ref().array_split_map(u32::from_le_bytes); + Self([ctr, n0, n1, n2]) + } + + /// This is "less safe" because it hands off management of the counter to + /// the caller. + #[cfg(any( + test, + not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" + )) + ))] + fn into_words_less_safe(self) -> [u32; 4] { + self.0 + } +} + +pub const KEY_LEN: usize = 32; + +const BLOCK_LEN: usize = 64; + +#[cfg(test)] +mod tests { + extern crate alloc; + + use super::{super::overlapping::IndexError, *}; + use crate::error; + use crate::testutil as test; + use alloc::vec; + + const MAX_ALIGNMENT_AND_OFFSET: (usize, usize) = (15, 259); + const MAX_ALIGNMENT_AND_OFFSET_SUBSET: (usize, usize) = + if cfg!(any(not(debug_assertions), feature = "slow_tests")) { + MAX_ALIGNMENT_AND_OFFSET + } else { + (0, 0) + }; + + #[test] + fn chacha20_test_default() { + // Always use `MAX_OFFSET` if we hav assembly code. + let max_offset = if cfg!(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" + )) { + MAX_ALIGNMENT_AND_OFFSET + } else { + MAX_ALIGNMENT_AND_OFFSET_SUBSET + }; + chacha20_test(max_offset, Key::encrypt); + } + + // Smoketest the fallback implementation. + #[test] + fn chacha20_test_fallback() { + chacha20_test(MAX_ALIGNMENT_AND_OFFSET_SUBSET, |key, ctr, in_out, _cpu| { + fallback::ChaCha20_ctr32(key, ctr, in_out) + }); + } + + // Verifies the encryption is successful when done on overlapping buffers. + // + // On some branches of the 32-bit x86 and ARM assembly code the in-place + // operation fails in some situations where the input/output buffers are + // not exactly overlapping. Such failures are dependent not only on the + // degree of overlapping but also the length of the data. `encrypt_within` + // works around that. + fn chacha20_test( + max_alignment_and_offset: (usize, usize), + f: impl for<'k, 'o> Fn(&'k Key, Counter, Overlapping<'o>, cpu::Features), + ) { + let cpu = cpu::features(); + + // Reuse a buffer to avoid slowing down the tests with allocations. + let mut buf = vec![0u8; 1300]; + + test::run( + test_vector_file!("chacha_tests.txt"), + move |section, test_case| { + assert_eq!(section, ""); + + let key = test_case.consume_bytes("Key"); + let key: &[u8; KEY_LEN] = key.as_slice().try_into()?; + let key = Key::new(*key); + + let ctr = test_case.consume_usize("Ctr"); + let nonce = test_case.consume_bytes("Nonce"); + let input = test_case.consume_bytes("Input"); + let output = test_case.consume_bytes("Output"); + + // Run the test case over all prefixes of the input because the + // behavior of ChaCha20 implementation changes dependent on the + // length of the input. 
+ for len in 0..=input.len() { + #[allow(clippy::cast_possible_truncation)] + chacha20_test_case_inner( + &key, + &nonce, + ctr as u32, + &input[..len], + &output[..len], + &mut buf, + max_alignment_and_offset, + cpu, + &f, + ); + } + + Ok(()) + }, + ); + } + + fn chacha20_test_case_inner( + key: &Key, + nonce: &[u8], + ctr: u32, + input: &[u8], + expected: &[u8], + buf: &mut [u8], + (max_alignment, max_offset): (usize, usize), + cpu: cpu::Features, + f: &impl for<'k, 'o> Fn(&'k Key, Counter, Overlapping<'o>, cpu::Features), + ) { + const ARBITRARY: u8 = 123; + + for alignment in 0..=max_alignment { + buf[..alignment].fill(ARBITRARY); + let buf = &mut buf[alignment..]; + for offset in 0..=max_offset { + let buf = &mut buf[..(offset + input.len())]; + buf[..offset].fill(ARBITRARY); + let src = offset..; + buf[src.clone()].copy_from_slice(input); + + let ctr = Counter::from_nonce_and_ctr( + Nonce::try_assume_unique_for_key(nonce).unwrap(), + ctr, + ); + let in_out = Overlapping::new(buf, src) + .map_err(error::erase::) + .unwrap(); + f(key, ctr, in_out, cpu); + assert_eq!(&buf[..input.len()], expected) + } + } + } +} diff --git a/ring-0.17.14/src/aead/chacha/fallback.rs b/ring-0.17.14/src/aead/chacha/fallback.rs new file mode 100644 index 0000000000..38f5430fd7 --- /dev/null +++ b/ring-0.17.14/src/aead/chacha/fallback.rs @@ -0,0 +1,108 @@ +// Copyright 2021 Brian Smith. +// Portions Copyright (c) 2014, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Adapted from the public domain, estream code by D. Bernstein. +// Adapted from the BoringSSL crypto/chacha/chacha.c. + +use super::{super::overlapping::IndexError, Counter, Key, Overlapping, BLOCK_LEN}; +use crate::{bb, polyfill::sliceutil}; +use core::mem::size_of; + +pub(super) fn ChaCha20_ctr32(key: &Key, counter: Counter, mut in_out: Overlapping<'_>) { + const SIGMA: [u32; 4] = [ + u32::from_le_bytes(*b"expa"), + u32::from_le_bytes(*b"nd 3"), + u32::from_le_bytes(*b"2-by"), + u32::from_le_bytes(*b"te k"), + ]; + + let key = key.words_less_safe(); + let counter = counter.into_words_less_safe(); + + let mut state = [ + SIGMA[0], SIGMA[1], SIGMA[2], SIGMA[3], key[0], key[1], key[2], key[3], key[4], key[5], + key[6], key[7], counter[0], counter[1], counter[2], counter[3], + ]; + + let mut in_out_len = in_out.len(); + + let mut buf = [0u8; BLOCK_LEN]; + while in_out_len > 0 { + chacha_core(&mut buf, &state); + state[12] += 1; + + debug_assert_eq!(in_out_len, in_out.len()); + + // Both branches do the same thing, but the duplication helps the + // compiler optimize (vectorize) the `BLOCK_LEN` case. + if in_out_len >= BLOCK_LEN { + in_out = in_out + .split_first_chunk::(|in_out| { + bb::xor_assign_at_start(&mut buf, in_out.input()); + sliceutil::overwrite_at_start(in_out.into_unwritten_output(), &buf); + }) + .unwrap_or_else(|IndexError { .. 
}| { + // Since `in_out_len == in_out.len() && in_out_len >= BLOCK_LEN`. + unreachable!() + }); + } else { + bb::xor_assign_at_start(&mut buf, in_out.input()); + sliceutil::overwrite_at_start(in_out.into_unwritten_output(), &buf); + break; + } + + in_out_len -= BLOCK_LEN; + } +} + +// Performs 20 rounds of ChaCha on `input`, storing the result in `output`. +#[inline(always)] +fn chacha_core(output: &mut [u8; BLOCK_LEN], input: &State) { + let mut x = *input; + + for _ in (0..20).step_by(2) { + quarterround(&mut x, 0, 4, 8, 12); + quarterround(&mut x, 1, 5, 9, 13); + quarterround(&mut x, 2, 6, 10, 14); + quarterround(&mut x, 3, 7, 11, 15); + quarterround(&mut x, 0, 5, 10, 15); + quarterround(&mut x, 1, 6, 11, 12); + quarterround(&mut x, 2, 7, 8, 13); + quarterround(&mut x, 3, 4, 9, 14); + } + + for (x, input) in x.iter_mut().zip(input.iter()) { + *x = x.wrapping_add(*input); + } + + output + .chunks_exact_mut(size_of::()) + .zip(x.iter()) + .for_each(|(output, &x)| output.copy_from_slice(&x.to_le_bytes())); +} + +#[inline(always)] +fn quarterround(x: &mut State, a: usize, b: usize, c: usize, d: usize) { + #[inline(always)] + fn step(x: &mut State, a: usize, b: usize, c: usize, rotation: u32) { + x[a] = x[a].wrapping_add(x[b]); + x[c] = (x[c] ^ x[a]).rotate_left(rotation); + } + step(x, a, b, d, 16); + step(x, c, d, b, 12); + step(x, a, b, d, 8); + step(x, c, d, b, 7); +} + +type State = [u32; BLOCK_LEN / 4]; diff --git a/ring-0.17.14/src/aead/chacha/ffi.rs b/ring-0.17.14/src/aead/chacha/ffi.rs new file mode 100644 index 0000000000..bd570133fe --- /dev/null +++ b/ring-0.17.14/src/aead/chacha/ffi.rs @@ -0,0 +1,66 @@ +// Copyright 2016-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{super::overlapping::Overlapping, Counter, Key}; + +// `unsafe { (N, C, InOut) => f }` means that the function `f` is safe to call +// iff the in/out length is at least `N`, the CPU features `C` are available, +// and the input type is `InOut`. If `f` supports overlapping input/output then +// `InOut` should be `Overlapping<'_, u8>`; otherwise it should be `&mut [u8]`. +macro_rules! chacha20_ctr32_ffi { + ( unsafe { ($MIN_LEN:expr, $Cpu:ty, $InOut:ty) => $f:ident }, + $key:expr, $counter:expr, $in_out:expr, $cpu:expr ) => {{ + prefixed_extern! { + fn $f( + out: *mut u8, + in_: *const u8, + in_len: crate::c::size_t, + key: &[u32; 8], + counter: &crate::aead::chacha::Counter, + ); + } + // SAFETY: The user asserts that $f has the signature above and is safe + // to call if additionally we have a value of type `$Cpu` and an in/out + // value of the indicated type, which we do. + unsafe { + crate::aead::chacha::ffi::chacha20_ctr32_ffi::<$InOut, $Cpu, $MIN_LEN>( + $key, $counter, $in_out, $cpu, $f, + ) + } + }}; +} + +// Panics if `in_out.len() < MIN_LEN`. 
The caller should have guarded against +// that so that the assertion gets optimized away. +pub(super) unsafe fn chacha20_ctr32_ffi< + 'o, + InOut: 'o + Into>, + Cpu, + const MIN_LEN: usize, +>( + key: &Key, + counter: Counter, + in_out: InOut, + cpu: Cpu, + f: unsafe extern "C" fn(*mut u8, *const u8, crate::c::size_t, &[u32; 8], &Counter), +) { + assert!(MIN_LEN > 0); + let in_out: Overlapping<'_, u8> = in_out.into(); + in_out.with_input_output_len(|input, output, len| { + assert!(len >= MIN_LEN); + let key = key.words_less_safe(); + let _: Cpu = cpu; + unsafe { f(output, input, len, key, &counter) } + }); +} diff --git a/ring-0.17.14/src/aead/chacha20_poly1305/integrated.rs b/ring-0.17.14/src/aead/chacha20_poly1305/integrated.rs new file mode 100644 index 0000000000..bb7ec28419 --- /dev/null +++ b/ring-0.17.14/src/aead/chacha20_poly1305/integrated.rs @@ -0,0 +1,230 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + super::{NONCE_LEN, TAG_LEN}, + chacha::Overlapping, + check_input_lengths, Aad, InputTooLongError, Key, Nonce, Tag, KEY_LEN, +}; +use cfg_if::cfg_if; + +macro_rules! declare_open { + ( $name:ident ) => { + prefixed_extern! { + fn $name( + out_plaintext: *mut u8, + ciphertext: *const u8, + plaintext_len: usize, + ad: *const u8, + ad_len: usize, + data: &mut InOut, + ); + } + }; +} + +macro_rules! declare_seal { + ( $name:ident ) => { + prefixed_extern! { + fn $name( + out_ciphertext: *mut u8, + plaintext: *const u8, + plaintext_len: usize, + ad: *const u8, + ad_len: usize, + data: &mut InOut, + ); + } + }; +} + +cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + use crate::cpu::arm::Neon; + type RequiredCpuFeatures = Neon; + type OptionalCpuFeatures = (); + } else { + use crate::cpu::intel::{Avx2, Bmi2, Sse41}; + type RequiredCpuFeatures = Sse41; + type OptionalCpuFeatures = (Avx2, Bmi2); + } +} + +pub(super) fn seal( + Key(key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + required_cpu_features: RequiredCpuFeatures, + optional_cpu_features: Option, +) -> Result { + check_input_lengths(aad, in_out)?; + + // XXX: BoringSSL uses `alignas(16)` on `key` instead of on the + // structure, but Rust can't do that yet; see + // https://github.com/rust-lang/rust/issues/73557. + // + // Keep in sync with the anonymous struct of BoringSSL's + // `chacha20_poly1305_seal_data`. 
+ #[repr(align(16), C)] + #[derive(Clone, Copy)] + struct seal_data_in { + key: [u32; KEY_LEN / 4], + counter: u32, + nonce: [u8; NONCE_LEN], + extra_ciphertext: *const u8, + extra_ciphertext_len: usize, + } + + let mut data = InOut { + input: seal_data_in { + key: *key.words_less_safe(), + counter: 0, + nonce: *nonce.as_ref(), + extra_ciphertext: core::ptr::null(), + extra_ciphertext_len: 0, + }, + }; + + // Encrypts `plaintext_len` bytes from `plaintext` and writes them to `out_ciphertext`. + + let output = in_out.as_mut_ptr(); + let input = in_out.as_ptr(); + let len = in_out.len(); + let ad = aad.as_ref().as_ptr(); + let ad_len = aad.as_ref().len(); + + #[allow(clippy::needless_late_init)] + let tag; + + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + declare_seal! { chacha20_poly1305_seal } + let _: Neon = required_cpu_features; + let _: Option<()> = optional_cpu_features; + tag = unsafe { + chacha20_poly1305_seal(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } else { + let _: Sse41 = required_cpu_features; + if matches!(optional_cpu_features, Some((Avx2 { .. }, Bmi2 { .. }))) { + declare_seal! { chacha20_poly1305_seal_avx2 } + tag = unsafe { + chacha20_poly1305_seal_avx2(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } else { + declare_seal! { chacha20_poly1305_seal_sse41 } + tag = unsafe { + chacha20_poly1305_seal_sse41(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } + } + } + + Ok(Tag(*tag)) +} + +pub(super) fn open( + Key(key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: Overlapping<'_>, + required_cpu_features: RequiredCpuFeatures, + optional_cpu_features: Option, +) -> Result { + check_input_lengths(aad, in_out.input())?; + + // XXX: BoringSSL uses `alignas(16)` on `key` instead of on the + // structure, but Rust can't do that yet; see + // https://github.com/rust-lang/rust/issues/73557. + // + // Keep in sync with the anonymous struct of BoringSSL's + // `chacha20_poly1305_open_data`. + #[derive(Copy, Clone)] + #[repr(align(16), C)] + struct open_data_in { + key: [u32; KEY_LEN / 4], + counter: u32, + nonce: [u8; NONCE_LEN], + } + + let mut data = InOut { + input: open_data_in { + key: *key.words_less_safe(), + counter: 0, + nonce: *nonce.as_ref(), + }, + }; + + in_out.with_input_output_len(|input, output, len| { + let ad = aad.as_ref().as_ptr(); + let ad_len = aad.as_ref().len(); + + #[allow(clippy::needless_late_init)] + let tag; + + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + declare_open! { chacha20_poly1305_open } + let _: Neon = required_cpu_features; + let _: Option<()> = optional_cpu_features; + tag = unsafe { + chacha20_poly1305_open(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } else { + let _: Sse41 = required_cpu_features; + if matches!(optional_cpu_features, Some((Avx2 { .. }, Bmi2 { .. }))) { + declare_open! { chacha20_poly1305_open_avx2 } + tag = unsafe { + chacha20_poly1305_open_avx2(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } else { + declare_open! { chacha20_poly1305_open_sse41 } + tag = unsafe { + chacha20_poly1305_open_sse41(output, input, len, ad, ad_len, &mut data); + &data.out.tag + }; + } + } + } + + Ok(Tag(*tag)) + }) +} + +// Keep in sync with BoringSSL's `chacha20_poly1305_open_data` and +// `chacha20_poly1305_seal_data`. 
+#[repr(C)] +pub(super) union InOut +where + T: Copy, +{ + pub(super) input: T, + pub(super) out: Out, +} + +// It isn't obvious whether the assembly code works for tags that aren't +// 16-byte aligned. In practice it will always be 16-byte aligned because it +// is embedded in a union where the other member of the union is 16-byte +// aligned. +#[derive(Clone, Copy)] +#[repr(align(16), C)] +pub(super) struct Out { + pub(super) tag: [u8; TAG_LEN], +} diff --git a/ring-0.17.14/src/aead/chacha20_poly1305/mod.rs b/ring-0.17.14/src/aead/chacha20_poly1305/mod.rs new file mode 100644 index 0000000000..881749081a --- /dev/null +++ b/ring-0.17.14/src/aead/chacha20_poly1305/mod.rs @@ -0,0 +1,167 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + chacha::{self, Counter, Overlapping}, + poly1305, Aad, Nonce, Tag, +}; +use crate::{ + cpu, + error::InputTooLongError, + polyfill::{slice, sliceutil, u64_from_usize, usize_from_u64_saturated}, +}; +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64"))] { + use cpu::GetFeature as _; + mod integrated; + } +} + +pub(super) const KEY_LEN: usize = chacha::KEY_LEN; + +const MAX_IN_OUT_LEN: usize = super::max_input_len(64, 1); +// https://tools.ietf.org/html/rfc8439#section-2.8 +const _MAX_IN_OUT_LEN_BOUNDED_BY_RFC: () = + assert!(MAX_IN_OUT_LEN == usize_from_u64_saturated(274_877_906_880u64)); + +#[derive(Clone)] +pub(super) struct Key(chacha::Key); + +impl Key { + pub(super) fn new(value: [u8; KEY_LEN]) -> Self { + Self(chacha::Key::new(value)) + } +} + +pub(super) fn seal( + key: &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu: cpu::Features, +) -> Result { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + if let Some(required) = cpu.get_feature() { + return integrated::seal(key, nonce, aad, in_out, required, cpu.get_feature()); + } + + seal_fallback(key, nonce, aad, in_out, cpu) +} + +pub(super) fn seal_fallback( + Key(chacha20_key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: &mut [u8], + cpu: cpu::Features, +) -> Result { + let (counter, poly1305_key) = begin(chacha20_key, nonce, aad, in_out, cpu)?; + let mut auth = poly1305::Context::from_key(poly1305_key, cpu); + + poly1305_update_padded_16(&mut auth, aad.as_ref()); + chacha20_key.encrypt(counter, in_out.into(), cpu); + poly1305_update_padded_16(&mut auth, in_out); + Ok(finish(auth, aad.as_ref().len(), in_out.len())) +} + +pub(super) fn open( + key: &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: Overlapping<'_>, + cpu: cpu::Features, +) -> Result { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + if let Some(required) = cpu.get_feature() { + return 
integrated::open(key, nonce, aad, in_out, required, cpu.get_feature()); + } + + open_fallback(key, nonce, aad, in_out, cpu) +} + +pub(super) fn open_fallback( + Key(chacha20_key): &Key, + nonce: Nonce, + aad: Aad<&[u8]>, + in_out: Overlapping<'_>, + cpu: cpu::Features, +) -> Result { + let (counter, poly1305_key) = begin(chacha20_key, nonce, aad, in_out.input(), cpu)?; + let mut auth = poly1305::Context::from_key(poly1305_key, cpu); + + poly1305_update_padded_16(&mut auth, aad.as_ref()); + poly1305_update_padded_16(&mut auth, in_out.input()); + let in_out_len = in_out.len(); + chacha20_key.encrypt(counter, in_out, cpu); + Ok(finish(auth, aad.as_ref().len(), in_out_len)) +} + +fn check_input_lengths(aad: Aad<&[u8]>, input: &[u8]) -> Result<(), InputTooLongError> { + if input.len() > MAX_IN_OUT_LEN { + return Err(InputTooLongError::new(input.len())); + } + + // RFC 8439 Section 2.8 says the maximum AAD length is 2**64 - 1, which is + // never larger than usize::MAX, so we don't need an explicit length + // check. + const _USIZE_BOUNDED_BY_U64: u64 = u64_from_usize(usize::MAX); + let _ = aad; + + Ok(()) +} + +// Also used by chacha20_poly1305_openssh. +pub(super) fn begin( + key: &chacha::Key, + nonce: Nonce, + aad: Aad<&[u8]>, + input: &[u8], + cpu: cpu::Features, +) -> Result<(Counter, poly1305::Key), InputTooLongError> { + check_input_lengths(aad, input)?; + + let mut key_bytes = [0u8; poly1305::KEY_LEN]; + let counter = key.encrypt_single_block_with_ctr_0(nonce, &mut key_bytes, cpu); + let poly1305_key = poly1305::Key::new(key_bytes); + Ok((counter, poly1305_key)) +} + +fn finish(auth: poly1305::Context, aad_len: usize, in_out_len: usize) -> Tag { + let mut block = [0u8; poly1305::BLOCK_LEN]; + let (alen, clen) = block.split_at_mut(poly1305::BLOCK_LEN / 2); + alen.copy_from_slice(&u64::to_le_bytes(u64_from_usize(aad_len))); + clen.copy_from_slice(&u64::to_le_bytes(u64_from_usize(in_out_len))); + auth.finish(&block) +} + +#[inline] +fn poly1305_update_padded_16(ctx: &mut poly1305::Context, input: &[u8]) { + let (whole, remainder) = slice::as_chunks(input); + ctx.update(whole); + if !remainder.is_empty() { + let mut block = [0u8; poly1305::BLOCK_LEN]; + sliceutil::overwrite_at_start(&mut block, remainder); + ctx.update_block(block); + } +} diff --git a/ring-0.17.14/src/aead/chacha20_poly1305_openssh.rs b/ring-0.17.14/src/aead/chacha20_poly1305_openssh.rs new file mode 100644 index 0000000000..cf3950ba6b --- /dev/null +++ b/ring-0.17.14/src/aead/chacha20_poly1305_openssh.rs @@ -0,0 +1,212 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! The [chacha20-poly1305@openssh.com] AEAD-ish construct. +//! +//! This should only be used by SSH implementations. It has a similar, but +//! different API from `ring::aead` because the construct cannot use the same +//! 
API as `ring::aead` due to the way the construct handles the encrypted +//! packet length. +//! +//! The concatenation of a and b is denoted `a||b`. `K_1` and `K_2` are defined +//! in the [chacha20-poly1305@openssh.com] specification. `packet_length`, +//! `padding_length`, `payload`, and `random padding` are defined in +//! [RFC 4253]. The term `plaintext` is used as a shorthand for +//! `padding_length||payload||random padding`. +//! +//! [chacha20-poly1305@openssh.com]: +//! http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/usr.bin/ssh/PROTOCOL.chacha20poly1305?annotate=HEAD +//! [RFC 4253]: https://tools.ietf.org/html/rfc4253 + +use super::{ + chacha::{self, *}, + chacha20_poly1305, cpu, poly1305, Aad, Nonce, Tag, +}; +use crate::{ + bb, + error::{self, InputTooLongError}, + polyfill::slice, +}; + +/// A key for sealing packets. +pub struct SealingKey { + key: Key, +} + +impl SealingKey { + /// Constructs a new `SealingKey`. + pub fn new(key_material: &[u8; KEY_LEN]) -> Self { + Self { + key: Key::new(key_material), + } + } + + /// Seals (encrypts and signs) a packet. + /// + /// On input, `plaintext_in_ciphertext_out` must contain the unencrypted + /// `packet_length||plaintext` where `plaintext` is the + /// `padding_length||payload||random padding`. It will be overwritten by + /// `encrypted_packet_length||ciphertext`, where `encrypted_packet_length` + /// is encrypted with `K_1` and `ciphertext` is encrypted by `K_2`. + /// + /// # Panics + /// + /// Panics if `plaintext_in_ciphertext_out.len() < PACKET_LENGTH_LEN`. + /// + /// Panics if `plaintext_in_ciphertext_out` is longer than the maximum + /// input size for ChaCha20-Poly1305. Note that this limit is much, + /// much larger than SSH's 256KB maximum record size. + pub fn seal_in_place( + &self, + sequence_number: u32, + plaintext_in_ciphertext_out: &mut [u8], + tag_out: &mut [u8; TAG_LEN], + ) { + // XXX/TODO(SemVer): Refactor API to return an error. + let (len_in_out, data_and_padding_in_out): (&mut [u8; PACKET_LENGTH_LEN], _) = + slice::split_first_chunk_mut(plaintext_in_ciphertext_out).unwrap(); + + let cpu = cpu::features(); + // XXX/TODO(SemVer): Refactor API to return an error. + let (counter, poly_key) = chacha20_poly1305::begin( + &self.key.k_2, + make_nonce(sequence_number), + Aad::from(len_in_out), + data_and_padding_in_out, + cpu, + ) + .map_err(error::erase::) + .unwrap(); + + let _: Counter = self.key.k_1.encrypt_single_block_with_ctr_0( + make_nonce(sequence_number), + len_in_out, + cpu, + ); + self.key + .k_2 + .encrypt(counter, data_and_padding_in_out.into(), cpu); + + let Tag(tag) = poly1305::sign(poly_key, plaintext_in_ciphertext_out, cpu); + *tag_out = tag; + } +} + +/// A key for opening packets. +pub struct OpeningKey { + key: Key, +} + +impl OpeningKey { + /// Constructs a new `OpeningKey`. + pub fn new(key_material: &[u8; KEY_LEN]) -> Self { + Self { + key: Key::new(key_material), + } + } + + /// Returns the decrypted, but unauthenticated, packet length. + /// + /// Importantly, the result won't be authenticated until `open_in_place` is + /// called. + pub fn decrypt_packet_length( + &self, + sequence_number: u32, + encrypted_packet_length: [u8; PACKET_LENGTH_LEN], + ) -> [u8; PACKET_LENGTH_LEN] { + let cpu = cpu::features(); + let mut packet_length = encrypted_packet_length; + let _: Counter = self.key.k_1.encrypt_single_block_with_ctr_0( + make_nonce(sequence_number), + &mut packet_length, + cpu, + ); + packet_length + } + + /// Opens (authenticates and decrypts) a packet. 
+ /// + /// `ciphertext_in_plaintext_out` must be of the form + /// `encrypted_packet_length||ciphertext` where `ciphertext` is the + /// encrypted `plaintext`. When the function succeeds the ciphertext is + /// replaced by the plaintext and the result is `Ok(plaintext)`, where + /// `plaintext` is `&ciphertext_in_plaintext_out[PACKET_LENGTH_LEN..]`; + /// otherwise the contents of `ciphertext_in_plaintext_out` are unspecified + /// and must not be used. + pub fn open_in_place<'a>( + &self, + sequence_number: u32, + ciphertext_in_plaintext_out: &'a mut [u8], + tag: &[u8; TAG_LEN], + ) -> Result<&'a [u8], error::Unspecified> { + let (packet_length, after_packet_length): (&mut [u8; PACKET_LENGTH_LEN], _) = + slice::split_first_chunk_mut(ciphertext_in_plaintext_out).ok_or(error::Unspecified)?; + + let cpu = cpu::features(); + let (counter, poly_key) = chacha20_poly1305::begin( + &self.key.k_2, + make_nonce(sequence_number), + Aad::from(packet_length), + after_packet_length, + cpu, + ) + .map_err(error::erase::)?; + + // We must verify the tag before decrypting so that + // `ciphertext_in_plaintext_out` is unmodified if verification fails. + // This is beyond what we guarantee. + let calculated_tag = poly1305::sign(poly_key, ciphertext_in_plaintext_out, cpu); + bb::verify_slices_are_equal(calculated_tag.as_ref(), tag)?; + + // Won't panic because the length was checked above. + let after_packet_length = &mut ciphertext_in_plaintext_out[PACKET_LENGTH_LEN..]; + + self.key + .k_2 + .encrypt(counter, after_packet_length.into(), cpu); + + Ok(after_packet_length) + } +} + +struct Key { + k_1: chacha::Key, + k_2: chacha::Key, +} + +impl Key { + fn new(key_material: &[u8; KEY_LEN]) -> Self { + // The first half becomes K_2 and the second half becomes K_1. + let (k_2, k_1) = key_material.split_at(chacha::KEY_LEN); + Self { + k_1: chacha::Key::new(k_1.try_into().unwrap()), + k_2: chacha::Key::new(k_2.try_into().unwrap()), + } + } +} + +fn make_nonce(sequence_number: u32) -> Nonce { + let [s0, s1, s2, s3] = sequence_number.to_be_bytes(); + let nonce = [0, 0, 0, 0, 0, 0, 0, 0, s0, s1, s2, s3]; + Nonce::assume_unique_for_key(nonce) +} + +/// The length of key. +pub const KEY_LEN: usize = chacha::KEY_LEN * 2; + +/// The length in bytes of the `packet_length` field in a SSH packet. +pub const PACKET_LENGTH_LEN: usize = 4; // 32 bits + +/// The length in bytes of an authentication tag. +pub const TAG_LEN: usize = super::TAG_LEN; diff --git a/ring-0.17.14/src/aead/gcm.rs b/ring-0.17.14/src/aead/gcm.rs new file mode 100644 index 0000000000..443c19e16b --- /dev/null +++ b/ring-0.17.14/src/aead/gcm.rs @@ -0,0 +1,163 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
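// Illustration only (not part of the vendored ring sources): a minimal round
// trip through the `chacha20_poly1305_openssh` module shown above. The key
// material, sequence number, and packet bytes are made-up placeholder values;
// a real SSH implementation derives them from the key exchange and the
// connection state.
fn openssh_aead_round_trip() -> Result<(), ring::error::Unspecified> {
    use ring::aead::chacha20_poly1305_openssh::{
        OpeningKey, SealingKey, KEY_LEN, PACKET_LENGTH_LEN, TAG_LEN,
    };

    let key_material = [0x42u8; KEY_LEN]; // placeholder key material
    let sequence_number = 7u32;

    // packet_length || padding_length || payload || random padding
    let body = *b"\x04example payload\x00\x00\x00\x00";
    let mut packet = Vec::with_capacity(PACKET_LENGTH_LEN + body.len());
    packet.extend_from_slice(&(body.len() as u32).to_be_bytes());
    packet.extend_from_slice(&body);

    // Seal: the 4-byte length field is encrypted with K_1, the rest with K_2,
    // and the Poly1305 tag covers the whole encrypted packet.
    let mut tag = [0u8; TAG_LEN];
    SealingKey::new(&key_material).seal_in_place(sequence_number, &mut packet, &mut tag);

    // Open: decrypt the (still unauthenticated) length field first, then
    // authenticate and decrypt the rest of the packet.
    let opening_key = OpeningKey::new(&key_material);
    let len_bytes: [u8; PACKET_LENGTH_LEN] = packet[..PACKET_LENGTH_LEN].try_into().unwrap();
    let decrypted_len = opening_key.decrypt_packet_length(sequence_number, len_bytes);
    assert_eq!(u32::from_be_bytes(decrypted_len), body.len() as u32);
    let plaintext = opening_key.open_in_place(sequence_number, &mut packet, &tag)?;
    assert_eq!(plaintext, &body);
    Ok(())
}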
+ +use self::ffi::{Block, BLOCK_LEN, ZERO_BLOCK}; +use super::{aes_gcm, Aad}; +use crate::{ + bits::{BitLength, FromByteLen as _}, + error::{self, InputTooLongError}, + polyfill::{slice::AsChunks, sliceutil::overwrite_at_start, NotSend}, +}; +use cfg_if::cfg_if; + +pub(super) use ffi::KeyValue; + +cfg_if! { + if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), target_arch = "x86_64"))] { + pub(super) use self::ffi::{HTable, Xi}; + } else { + use self::ffi::{HTable, Xi}; + } +} + +#[macro_use] +mod ffi; + +pub(super) mod clmul; +pub(super) mod clmulavxmovbe; +pub(super) mod fallback; +pub(super) mod neon; +pub(super) mod vclmulavx2; + +pub(super) struct Context<'key, K> { + Xi: Xi, + key: &'key K, + aad_len: BitLength, + in_out_len: BitLength, + _not_send: NotSend, +} + +impl<'key, K: UpdateBlock> Context<'key, K> { + #[inline(always)] + pub(crate) fn new( + key: &'key K, + aad: Aad<&[u8]>, + in_out_len: usize, + ) -> Result { + if in_out_len > aes_gcm::MAX_IN_OUT_LEN { + return Err(error::Unspecified); + } + let in_out_len = + BitLength::from_byte_len(in_out_len).map_err(error::erase::)?; + let aad_len = BitLength::from_byte_len(aad.as_ref().len()) + .map_err(error::erase::)?; + + // NIST SP800-38D Section 5.2.1.1 says that the maximum AAD length is + // 2**64 - 1 bits, i.e. BitLength::MAX, so we don't need to do an + // explicit check here. + + let mut ctx = Self { + Xi: Xi(ZERO_BLOCK), + key, + aad_len, + in_out_len, + _not_send: NotSend::VALUE, + }; + + for ad in aad.0.chunks(BLOCK_LEN) { + let mut block = ZERO_BLOCK; + overwrite_at_start(&mut block, ad); + ctx.update_block(block); + } + + Ok(ctx) + } +} + +#[cfg(all( + target_arch = "aarch64", + target_endian = "little", + target_pointer_width = "64" +))] +impl Context<'_, K> { + pub(super) fn in_out_whole_block_bits(&self) -> BitLength { + use crate::polyfill::usize_from_u64; + const WHOLE_BLOCK_BITS_MASK: usize = !0b111_1111; + #[allow(clippy::assertions_on_constants)] + const _WHOLE_BLOCK_BITS_MASK_CORRECT: () = + assert!(WHOLE_BLOCK_BITS_MASK == !((BLOCK_LEN * 8) - 1)); + BitLength::from_bits(usize_from_u64(self.in_out_len.as_bits()) & WHOLE_BLOCK_BITS_MASK) + } +} + +#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +/// Access to `inner` for the integrated AES-GCM implementations only. +impl Context<'_, clmul::Key> { + #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (&self.key.inner(), &mut self.Xi) + } +} + +#[cfg(target_arch = "x86_64")] +impl Context<'_, clmulavxmovbe::Key> { + /// Access to `inner` for the integrated AES-GCM implementations only. + #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (self.key.inner(), &mut self.Xi) + } +} + +#[cfg(target_arch = "x86_64")] +impl Context<'_, vclmulavx2::Key> { + /// Access to `inner` for the integrated AES-GCM implementations only. 
+ #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (self.key.inner(), &mut self.Xi) + } +} + +impl Context<'_, K> { + #[inline(always)] + pub fn update_blocks(&mut self, input: AsChunks) { + self.key.update_blocks(&mut self.Xi, input); + } +} + +impl Context<'_, K> { + pub fn update_block(&mut self, a: Block) { + self.key.update_block(&mut self.Xi, a); + } + + #[inline(always)] + pub(super) fn pre_finish(mut self, f: F) -> super::Tag + where + F: FnOnce(Block) -> super::Tag, + { + let mut block = [0u8; BLOCK_LEN]; + let (alen, clen) = block.split_at_mut(BLOCK_LEN / 2); + alen.copy_from_slice(&BitLength::::to_be_bytes(self.aad_len)); + clen.copy_from_slice(&BitLength::::to_be_bytes(self.in_out_len)); + self.update_block(block); + f(self.Xi.0) + } +} + +pub(super) trait UpdateBlock { + fn update_block(&self, xi: &mut Xi, a: Block); +} + +pub(super) trait UpdateBlocks { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks); +} diff --git a/ring-0.17.14/src/aead/gcm/clmul.rs b/ring-0.17.14/src/aead/gcm/clmul.rs new file mode 100644 index 0000000000..8cd55a4eeb --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/clmul.rs @@ -0,0 +1,73 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] + +use super::{ffi::KeyValue, HTable, UpdateBlock, Xi}; +use crate::aead::gcm::ffi::BLOCK_LEN; +use crate::cpu; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use {super::UpdateBlocks, crate::polyfill::slice::AsChunks}; + +#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +pub(in super::super) type RequiredCpuFeatures = cpu::arm::PMull; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub(in super::super) type RequiredCpuFeatures = (cpu::intel::ClMul, cpu::intel::Ssse3); + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + #[cfg_attr(target_arch = "x86_64", inline(never))] + pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_clmul, value) }, + } + } + + #[cfg(target_arch = "aarch64")] + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl UpdateBlock for Key { + #[cfg(target_arch = "aarch64")] + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + prefixed_extern! 
{ + fn gcm_gmult_clmul(xi: &mut Xi, Htable: &HTable); + } + xi.bitxor_assign(a); + unsafe { self.h_table.gmult(gcm_gmult_clmul, xi) }; + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + self.update_blocks(xi, (&a).into()) + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { + unsafe { ghash!(gcm_ghash_clmul, xi, &self.h_table, input) } + } +} diff --git a/ring-0.17.14/src/aead/gcm/clmulavxmovbe.rs b/ring-0.17.14/src/aead/gcm/clmulavxmovbe.rs new file mode 100644 index 0000000000..c53e2410cc --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/clmulavxmovbe.rs @@ -0,0 +1,51 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{HTable, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; +use crate::{cpu::intel, polyfill::slice::AsChunks}; + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + #[inline(never)] + pub(in super::super) fn new( + value: KeyValue, + _required_cpu_features: (intel::ClMul, intel::Avx, intel::Movbe), + ) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_avx, value) }, + } + } + + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + self.update_blocks(xi, (&a).into()) + } +} + +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { + unsafe { ghash!(gcm_ghash_avx, xi, self.inner(), input) } + } +} diff --git a/ring-0.17.14/src/aead/gcm/fallback.rs b/ring-0.17.14/src/aead/gcm/fallback.rs new file mode 100644 index 0000000000..4d5403bde8 --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/fallback.rs @@ -0,0 +1,271 @@ +// Copyright (c) 2019, Google Inc. +// Portions Copyright 2020-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// This file is based on BoringSSL's gcm_nohw.c. 
+ +// This file contains a implementation of GHASH based on the notes +// in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction +// algorithm described in +// https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. +// +// Unlike the BearSSL notes, we use u128 in the 64-bit implementation. + +use super::{ffi::U128, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; +use crate::polyfill::{slice::AsChunks, ArraySplitMap as _}; + +#[derive(Clone)] +pub struct Key { + h: U128, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue) -> Self { + Self { h: init(value) } + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + xi.bitxor_assign(a); + gmult(xi, self.h); + } +} + +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { + ghash(xi, self.h, input); + } +} + +#[cfg(target_pointer_width = "64")] +fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) { + #[allow(clippy::cast_possible_truncation)] + #[inline(always)] + fn lo(a: u128) -> u64 { + a as u64 + } + + #[inline(always)] + fn hi(a: u128) -> u64 { + lo(a >> 64) + } + + #[inline(always)] + fn mul(a: u64, b: u64) -> u128 { + u128::from(a) * u128::from(b) + } + + // One term every four bits means the largest term is 64/4 = 16, which barely + // overflows into the next term. Using one term every five bits would cost 25 + // multiplications instead of 16. It is faster to mask off the bottom four + // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits + // separately. + let a0 = a & 0x1111111111111110; + let a1 = a & 0x2222222222222220; + let a2 = a & 0x4444444444444440; + let a3 = a & 0x8888888888888880; + + let b0 = b & 0x1111111111111111; + let b1 = b & 0x2222222222222222; + let b2 = b & 0x4444444444444444; + let b3 = b & 0x8888888888888888; + + let c0 = mul(a0, b0) ^ mul(a1, b3) ^ mul(a2, b2) ^ mul(a3, b1); + let c1 = mul(a0, b1) ^ mul(a1, b0) ^ mul(a2, b3) ^ mul(a3, b2); + let c2 = mul(a0, b2) ^ mul(a1, b1) ^ mul(a2, b0) ^ mul(a3, b3); + let c3 = mul(a0, b3) ^ mul(a1, b2) ^ mul(a2, b1) ^ mul(a3, b0); + + // Multiply the bottom four bits of |a| with |b|. + let a0_mask = 0u64.wrapping_sub(a & 1); + let a1_mask = 0u64.wrapping_sub((a >> 1) & 1); + let a2_mask = 0u64.wrapping_sub((a >> 2) & 1); + let a3_mask = 0u64.wrapping_sub((a >> 3) & 1); + let extra = u128::from(a0_mask & b) + ^ (u128::from(a1_mask & b) << 1) + ^ (u128::from(a2_mask & b) << 2) + ^ (u128::from(a3_mask & b) << 3); + + let lo = (lo(c0) & 0x1111111111111111) + ^ (lo(c1) & 0x2222222222222222) + ^ (lo(c2) & 0x4444444444444444) + ^ (lo(c3) & 0x8888888888888888) + ^ lo(extra); + let hi = (hi(c0) & 0x1111111111111111) + ^ (hi(c1) & 0x2222222222222222) + ^ (hi(c2) & 0x4444444444444444) + ^ (hi(c3) & 0x8888888888888888) + ^ hi(extra); + (lo, hi) +} + +#[cfg(not(target_pointer_width = "64"))] +fn gcm_mul32_nohw(a: u32, b: u32) -> u64 { + #[inline(always)] + fn mul(a: u32, b: u32) -> u64 { + u64::from(a) * u64::from(b) + } + + // One term every four bits means the largest term is 32/4 = 8, which does not + // overflow into the next term. 
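    // Illustration (not part of the vendored file): the same masking trick at
    // byte width. With the operand bits spaced four apart, each column of the
    // ordinary integer product collects at most two coincident partial
    // products here (at most eight in the `u32` code below, fifteen in the
    // masked `u64` code above), so no carry ever spills into the next column
    // and the low bit of each column is exactly the GF(2) sum being computed.
    fn clmul8(a: u8, b: u8) -> u16 {
        let mul8 = |x: u8, y: u8| u16::from(x) * u16::from(y);
        let (a0, a1, a2, a3) = (a & 0x11, a & 0x22, a & 0x44, a & 0x88);
        let (b0, b1, b2, b3) = (b & 0x11, b & 0x22, b & 0x44, b & 0x88);
        let c0 = mul8(a0, b0) ^ mul8(a1, b3) ^ mul8(a2, b2) ^ mul8(a3, b1);
        let c1 = mul8(a0, b1) ^ mul8(a1, b0) ^ mul8(a2, b3) ^ mul8(a3, b2);
        let c2 = mul8(a0, b2) ^ mul8(a1, b1) ^ mul8(a2, b0) ^ mul8(a3, b3);
        let c3 = mul8(a0, b3) ^ mul8(a1, b2) ^ mul8(a2, b1) ^ mul8(a3, b0);
        (c0 & 0x1111) | (c1 & 0x2222) | (c2 & 0x4444) | (c3 & 0x8888)
    }
    // For example, clmul8(0b11, 0b11) == 0b101: (x + 1)^2 = x^2 + 1 over GF(2).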
+ let a0 = a & 0x11111111; + let a1 = a & 0x22222222; + let a2 = a & 0x44444444; + let a3 = a & 0x88888888; + + let b0 = b & 0x11111111; + let b1 = b & 0x22222222; + let b2 = b & 0x44444444; + let b3 = b & 0x88888888; + + let c0 = mul(a0, b0) ^ mul(a1, b3) ^ mul(a2, b2) ^ mul(a3, b1); + let c1 = mul(a0, b1) ^ mul(a1, b0) ^ mul(a2, b3) ^ mul(a3, b2); + let c2 = mul(a0, b2) ^ mul(a1, b1) ^ mul(a2, b0) ^ mul(a3, b3); + let c3 = mul(a0, b3) ^ mul(a1, b2) ^ mul(a2, b1) ^ mul(a3, b0); + + (c0 & 0x1111111111111111) + | (c1 & 0x2222222222222222) + | (c2 & 0x4444444444444444) + | (c3 & 0x8888888888888888) +} + +#[cfg(not(target_pointer_width = "64"))] +fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) { + #[inline(always)] + fn lo(a: u64) -> u32 { + a as u32 + } + #[inline(always)] + fn hi(a: u64) -> u32 { + lo(a >> 32) + } + + let a0 = lo(a); + let a1 = hi(a); + let b0 = lo(b); + let b1 = hi(b); + // Karatsuba multiplication. + let lo = gcm_mul32_nohw(a0, b0); + let hi = gcm_mul32_nohw(a1, b1); + let mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi; + (lo ^ (mid << 32), hi ^ (mid >> 32)) +} + +fn init(value: KeyValue) -> U128 { + let xi = value.into_inner(); + + // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This + // avoids a shift by 1 in the multiplication, needed to account for bit + // reversal losing a bit after multiplication, that is, + // rev128(X) * rev128(Y) = rev255(X*Y). + // + // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation + // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped. + // + // See also slide 16 of + // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf + let mut lo = xi[1]; + let mut hi = xi[0]; + + let mut carry = hi >> 63; + carry = 0u64.wrapping_sub(carry); + + hi <<= 1; + hi |= lo >> 63; + lo <<= 1; + + // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we + // conditionally add 0xc200...0001. + lo ^= carry & 1; + hi ^= carry & 0xc200000000000000; + + // This implementation does not use the rest of |Htable|. + U128 { hi, lo } +} + +fn gcm_polyval_nohw(xi: &mut [u64; 2], h: U128) { + // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0| + // through |r3|. Note there is no byte or bit reversal because we are + // evaluating POLYVAL. + let (r0, mut r1) = gcm_mul64_nohw(xi[0], h.lo); + let (mut r2, mut r3) = gcm_mul64_nohw(xi[1], h.hi); + let (mut mid0, mut mid1) = gcm_mul64_nohw(xi[0] ^ xi[1], h.hi ^ h.lo); + mid0 ^= r0 ^ r2; + mid1 ^= r1 ^ r3; + r2 ^= mid1; + r1 ^= mid0; + + // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and + // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. We + // have: + // + // 1 = x^121 + x^126 + x^127 + x^128 + // x^-128 = x^-7 + x^-2 + x^-1 + 1 + // + // This is the GHASH reduction step, but with bits flowing in reverse. + + // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require + // another reduction steps. Instead, we gather the excess bits, incorporate + // them into |r0| and |r1| and reduce once. See slides 17-19 + // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. 
+ r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57); + + // 1 + r2 ^= r0; + r3 ^= r1; + + // x^-1 + r2 ^= r0 >> 1; + r2 ^= r1 << 63; + r3 ^= r1 >> 1; + + // x^-2 + r2 ^= r0 >> 2; + r2 ^= r1 << 62; + r3 ^= r1 >> 2; + + // x^-7 + r2 ^= r0 >> 7; + r2 ^= r1 << 57; + r3 ^= r1 >> 7; + + *xi = [r2, r3]; +} + +fn gmult(xi: &mut Xi, h: U128) { + with_swapped_xi(xi, |swapped| { + gcm_polyval_nohw(swapped, h); + }) +} + +fn ghash(xi: &mut Xi, h: U128, input: AsChunks) { + with_swapped_xi(xi, |swapped| { + input.into_iter().for_each(|&input| { + let input = input.array_split_map(u64::from_be_bytes); + swapped[0] ^= input[1]; + swapped[1] ^= input[0]; + gcm_polyval_nohw(swapped, h); + }); + }); +} + +#[inline] +fn with_swapped_xi(Xi(xi): &mut Xi, f: impl FnOnce(&mut [u64; 2])) { + let unswapped: [u64; 2] = xi.array_split_map(u64::from_be_bytes); + let mut swapped: [u64; 2] = [unswapped[1], unswapped[0]]; + f(&mut swapped); + let (xi_0, xi_1) = xi.split_at_mut(BLOCK_LEN / 2); + xi_0.copy_from_slice(&u64::to_be_bytes(swapped[1])); + xi_1.copy_from_slice(&u64::to_be_bytes(swapped[0])); +} diff --git a/ring-0.17.14/src/aead/gcm/ffi.rs b/ring-0.17.14/src/aead/gcm/ffi.rs new file mode 100644 index 0000000000..c655a0dd39 --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/ffi.rs @@ -0,0 +1,165 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + bb, + polyfill::{slice::AsChunks, ArraySplitMap}, +}; + +pub(in super::super) const BLOCK_LEN: usize = 16; +pub(in super::super) type Block = [u8; BLOCK_LEN]; +pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +macro_rules! htable_new { + ( $name:ident, $value:expr $(,)? ) => {{ + use crate::aead::gcm::ffi::HTable; + prefixed_extern! { + fn $name(HTable: &mut HTable, h: &[u64; 2]); + } + HTable::new($name, $value) + }}; +} + +/// SAFETY: +/// * The function `$name` must meet the contract of the `f` paramweter of +/// `ghash()`. +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +macro_rules! ghash { + ( $name:ident, $xi:expr, $h_table:expr, $input:expr $(,)? ) => {{ + use crate::aead::gcm::ffi::{HTable, Xi}; + prefixed_extern! 
{ + fn $name( + xi: &mut Xi, + Htable: &HTable, + inp: *const u8, + len: crate::c::NonZero_size_t, + ); + } + $h_table.ghash($name, $xi, $input) + }}; +} + +pub(in super::super) struct KeyValue([u64; 2]); + +impl KeyValue { + pub(in super::super) fn new(value: Block) -> Self { + Self(value.array_split_map(u64::from_be_bytes)) + } + + pub(super) fn into_inner(self) -> [u64; 2] { + self.0 + } +} + +/// SAFETY: +/// * `f` must read `len` bytes from `inp`; it may assume +/// that `len` is a (non-zero) multiple of `BLOCK_LEN`. +/// * `f` may inspect CPU features. +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +impl HTable { + pub(super) unsafe fn new( + init: unsafe extern "C" fn(HTable: &mut HTable, &[u64; 2]), + value: KeyValue, + ) -> Self { + let mut r = Self { + Htable: [U128 { hi: 0, lo: 0 }; HTABLE_LEN], + }; + unsafe { init(&mut r, &value.0) }; + r + } + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") + ))] + pub(super) unsafe fn gmult( + &self, + f: unsafe extern "C" fn(xi: &mut Xi, h_table: &HTable), + xi: &mut Xi, + ) { + unsafe { f(xi, self) } + } + + pub(super) unsafe fn ghash( + &self, + f: unsafe extern "C" fn( + xi: &mut Xi, + Htable: &HTable, + inp: *const u8, + len: crate::c::NonZero_size_t, + ), + xi: &mut Xi, + input: AsChunks, + ) { + use core::num::NonZeroUsize; + + let input = input.as_flattened(); + + let input_len = match NonZeroUsize::new(input.len()) { + Some(len) => len, + None => { + return; + } + }; + + // SAFETY: + // * There are `input_len: NonZeroUsize` bytes available at `input` for + // `f` to read. + unsafe { + f(xi, self, input.as_ptr(), input_len); + } + } +} + +// The alignment is required by some assembly code, such as `ghash-ssse3-*`. +#[derive(Clone)] +#[repr(C, align(16))] +pub(in super::super) struct HTable { + Htable: [U128; HTABLE_LEN], +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub(super) struct U128 { + pub(super) hi: u64, + pub(super) lo: u64, +} + +const HTABLE_LEN: usize = 16; + +#[repr(transparent)] +pub(in super::super) struct Xi(pub(super) Block); + +impl Xi { + #[inline] + pub(super) fn bitxor_assign(&mut self, a: Block) { + self.0 = bb::xor_16(self.0, a) + } +} diff --git a/ring-0.17.14/src/aead/gcm/neon.rs b/ring-0.17.14/src/aead/gcm/neon.rs new file mode 100644 index 0000000000..814cd52187 --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/neon.rs @@ -0,0 +1,52 @@ +// Copyright 2018-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +#![cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little") +))] + +use super::{HTable, KeyValue, UpdateBlock, UpdateBlocks, Xi, BLOCK_LEN}; +use crate::{cpu, polyfill::slice::AsChunks}; + +pub(in super::super) type RequiredCpuFeatures = cpu::arm::Neon; + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_neon, value) }, + } + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + prefixed_extern! { + fn gcm_gmult_neon(xi: &mut Xi, Htable: &HTable); + } + xi.bitxor_assign(a); + unsafe { self.h_table.gmult(gcm_gmult_neon, xi) }; + } +} + +impl UpdateBlocks for Key { + fn update_blocks(&self, xi: &mut Xi, input: AsChunks) { + unsafe { ghash!(gcm_ghash_neon, xi, &self.h_table, input) } + } +} diff --git a/ring-0.17.14/src/aead/gcm/vclmulavx2.rs b/ring-0.17.14/src/aead/gcm/vclmulavx2.rs new file mode 100644 index 0000000000..ebf4e76ad4 --- /dev/null +++ b/ring-0.17.14/src/aead/gcm/vclmulavx2.rs @@ -0,0 +1,46 @@ +// Copyright 2018-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{ffi::KeyValue, HTable, UpdateBlock, Xi}; +use crate::{ + aead::gcm::ffi::BLOCK_LEN, + cpu::intel::{Avx2, VAesClmul}, + polyfill::slice::AsChunks, +}; + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) }, + } + } + + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + let input: AsChunks = (&a).into(); + unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) } + } +} diff --git a/ring-0.17.14/src/aead/less_safe_key.rs b/ring-0.17.14/src/aead/less_safe_key.rs new file mode 100644 index 0000000000..10483fc0d3 --- /dev/null +++ b/ring-0.17.14/src/aead/less_safe_key.rs @@ -0,0 +1,181 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Aad, Algorithm, KeyInner, Nonce, Tag, UnboundKey, TAG_LEN}; +use crate::{cpu, error}; +use core::ops::RangeFrom; + +/// Immutable keys for use in situations where `OpeningKey`/`SealingKey` and +/// `NonceSequence` cannot reasonably be used. +/// +/// Prefer to use `OpeningKey`/`SealingKey` and `NonceSequence` when practical. +#[derive(Clone)] +pub struct LessSafeKey { + inner: KeyInner, + algorithm: &'static Algorithm, +} + +impl LessSafeKey { + /// Constructs a `LessSafeKey`. + #[inline] + pub fn new(key: UnboundKey) -> Self { + key.into_inner() + } + + pub(super) fn new_( + algorithm: &'static Algorithm, + key_bytes: &[u8], + cpu_features: cpu::Features, + ) -> Result { + Ok(Self { + inner: algorithm.new_key(key_bytes, cpu_features)?, + algorithm, + }) + } + + /// Like [open_in_place](Self::open_in_place), except the authentication tag is + /// passed separately. + #[inline] + pub fn open_in_place_separate_tag<'in_out, A>( + &self, + nonce: Nonce, + aad: Aad, + tag: Tag, + in_out: &'in_out mut [u8], + ciphertext: RangeFrom, + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + let aad = Aad::from(aad.as_ref()); + self.algorithm.open_within( + &self.inner, + nonce, + aad, + tag, + in_out, + ciphertext, + cpu::features(), + ) + } + + /// Like [`super::OpeningKey::open_in_place()`], except it accepts an + /// arbitrary nonce. + /// + /// `nonce` must be unique for every use of the key to open data. + #[inline] + pub fn open_in_place<'in_out, A>( + &self, + nonce: Nonce, + aad: Aad, + in_out: &'in_out mut [u8], + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + self.open_within(nonce, aad, in_out, 0..) + } + + /// Like [`super::OpeningKey::open_within()`], except it accepts an + /// arbitrary nonce. + /// + /// `nonce` must be unique for every use of the key to open data. + #[inline] + pub fn open_within<'in_out, A>( + &self, + nonce: Nonce, + aad: Aad, + in_out: &'in_out mut [u8], + ciphertext_and_tag: RangeFrom, + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + let tag_offset = in_out + .len() + .checked_sub(TAG_LEN) + .ok_or(error::Unspecified)?; + + // Split the tag off the end of `in_out`. + let (in_out, received_tag) = in_out.split_at_mut(tag_offset); + let received_tag = (*received_tag).try_into()?; + let ciphertext = ciphertext_and_tag; + + self.open_in_place_separate_tag(nonce, aad, received_tag, in_out, ciphertext) + } + + /// Like [`super::SealingKey::seal_in_place_append_tag()`], except it + /// accepts an arbitrary nonce. + /// + /// `nonce` must be unique for every use of the key to seal data. + #[inline] + pub fn seal_in_place_append_tag( + &self, + nonce: Nonce, + aad: Aad, + in_out: &mut InOut, + ) -> Result<(), error::Unspecified> + where + A: AsRef<[u8]>, + InOut: AsMut<[u8]> + for<'in_out> Extend<&'in_out u8>, + { + self.seal_in_place_separate_tag(nonce, aad, in_out.as_mut()) + .map(|tag| in_out.extend(tag.as_ref())) + } + + /// Like `super::SealingKey::seal_in_place_separate_tag()`, except it + /// accepts an arbitrary nonce. + /// + /// `nonce` must be unique for every use of the key to seal data. 
+ #[inline] + pub fn seal_in_place_separate_tag( + &self, + nonce: Nonce, + aad: Aad, + in_out: &mut [u8], + ) -> Result + where + A: AsRef<[u8]>, + { + self.algorithm.seal( + &self.inner, + nonce, + Aad::from(aad.as_ref()), + in_out, + cpu::features(), + ) + } + + /// The key's AEAD algorithm. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } + + pub(super) fn fmt_debug( + &self, + type_name: &'static str, + f: &mut core::fmt::Formatter, + ) -> Result<(), core::fmt::Error> { + f.debug_struct(type_name) + .field("algorithm", &self.algorithm()) + .finish() + } +} + +impl core::fmt::Debug for LessSafeKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + self.fmt_debug("LessSafeKey", f) + } +} diff --git a/ring-0.17.14/src/aead/nonce.rs b/ring-0.17.14/src/aead/nonce.rs new file mode 100644 index 0000000000..e314fae8d2 --- /dev/null +++ b/ring-0.17.14/src/aead/nonce.rs @@ -0,0 +1,51 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::error; + +/// A nonce for a single AEAD opening or sealing operation. +/// +/// The user must ensure, for a particular key, that each nonce is unique. +/// +/// `Nonce` intentionally doesn't implement `Clone` to ensure that each one is +/// consumed at most once. +pub struct Nonce([u8; NONCE_LEN]); + +impl Nonce { + /// Constructs a `Nonce` with the given value, assuming that the value is + /// unique for the lifetime of the key it is being used with. + /// + /// Fails if `value` isn't `NONCE_LEN` bytes long. + #[inline] + pub fn try_assume_unique_for_key(value: &[u8]) -> Result { + let value: &[u8; NONCE_LEN] = value.try_into()?; + Ok(Self::assume_unique_for_key(*value)) + } + + /// Constructs a `Nonce` with the given value, assuming that the value is + /// unique for the lifetime of the key it is being used with. + #[inline] + pub fn assume_unique_for_key(value: [u8; NONCE_LEN]) -> Self { + Self(value) + } +} + +impl AsRef<[u8; NONCE_LEN]> for Nonce { + fn as_ref(&self) -> &[u8; NONCE_LEN] { + &self.0 + } +} + +/// All the AEADs we support use 96-bit nonces. +pub const NONCE_LEN: usize = 96 / 8; diff --git a/ring-0.17.14/src/aead/opening_key.rs b/ring-0.17.14/src/aead/opening_key.rs new file mode 100644 index 0000000000..9e10fc9a05 --- /dev/null +++ b/ring-0.17.14/src/aead/opening_key.rs @@ -0,0 +1,143 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
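For context on how the `LessSafeKey` API above is typically driven by callers, a hedged usage sketch (assuming the crate is consumed as `ring` with its public `aead` module; key and nonce values are placeholders):

```rust
// Illustrative use of LessSafeKey::seal_in_place_append_tag / open_in_place;
// not part of the vendored sources. Key/nonce values are dummies.
use ring::aead::{Aad, LessSafeKey, Nonce, UnboundKey, AES_256_GCM};

fn main() -> Result<(), ring::error::Unspecified> {
    let key_bytes = [0u8; 32];
    let nonce_bytes = [0u8; 12]; // must never repeat for a given key
    let key = LessSafeKey::new(UnboundKey::new(&AES_256_GCM, &key_bytes)?);

    // Seal: the ciphertext overwrites the plaintext and the tag is appended.
    let mut in_out = b"hello".to_vec();
    key.seal_in_place_append_tag(
        Nonce::assume_unique_for_key(nonce_bytes),
        Aad::empty(),
        &mut in_out,
    )?;

    // Open: the caller supplies the same nonce; on success the returned slice
    // is the plaintext, without the tag.
    let plaintext = key.open_in_place(
        Nonce::assume_unique_for_key(nonce_bytes),
        Aad::empty(),
        &mut in_out,
    )?;
    assert_eq!(plaintext, &b"hello"[..]);
    Ok(())
}
```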
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Authenticated Encryption with Associated Data (AEAD). +//! +//! See [Authenticated encryption: relations among notions and analysis of the +//! generic composition paradigm][AEAD] for an introduction to the concept of +//! AEADs. +//! +//! [AEAD]: https://eprint.iacr.org/2000/025.pdf +//! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD + +use super::{Aad, Algorithm, BoundKey, LessSafeKey, NonceSequence, UnboundKey}; +use crate::error; +use core::ops::RangeFrom; + +/// An AEAD key for authenticating and decrypting ("opening"), bound to a nonce +/// sequence. +/// +/// Intentionally not `Clone` or `Copy` since cloning would allow duplication +/// of the nonce sequence. +pub struct OpeningKey { + key: LessSafeKey, + nonce_sequence: N, +} + +impl BoundKey for OpeningKey { + fn new(key: UnboundKey, nonce_sequence: N) -> Self { + Self { + key: key.into_inner(), + nonce_sequence, + } + } + + #[inline] + fn algorithm(&self) -> &'static Algorithm { + self.key.algorithm() + } +} + +impl core::fmt::Debug for OpeningKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + self.key.fmt_debug("OpeningKey", f) + } +} + +impl OpeningKey { + /// Authenticates and decrypts (“opens”) data in place. + /// + /// `aad` is the additional authenticated data (AAD), if any. + /// + /// On input, `in_out` must be the ciphertext followed by the tag. When + /// `open_in_place()` returns `Ok(plaintext)`, the input ciphertext + /// has been overwritten by the plaintext; `plaintext` will refer to the + /// plaintext without the tag. + /// + /// When `open_in_place()` returns `Err(..)`, `in_out` may have been + /// overwritten in an unspecified way. + #[inline] + pub fn open_in_place<'in_out, A>( + &mut self, + aad: Aad, + in_out: &'in_out mut [u8], + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + self.key + .open_in_place(self.nonce_sequence.advance()?, aad, in_out) + } + + /// Authenticates and decrypts (“opens”) data in place, with a shift. + /// + /// `aad` is the additional authenticated data (AAD), if any. + /// + /// On input, `in_out[ciphertext_and_tag]` must be the ciphertext followed + /// by the tag. When `open_within()` returns `Ok(plaintext)`, the plaintext + /// will be at `in_out[0..plaintext.len()]`. In other words, the following + /// two code fragments are equivalent for valid values of + /// `ciphertext_and_tag`, except `open_within` will often be more efficient: + /// + /// + /// ```skip + /// let plaintext = key.open_within(aad, in_out, cipertext_and_tag)?; + /// ``` + /// + /// ```skip + /// let ciphertext_and_tag_len = in_out[ciphertext_and_tag].len(); + /// in_out.copy_within(ciphertext_and_tag, 0); + /// let plaintext = key.open_in_place(aad, &mut in_out[..ciphertext_and_tag_len])?; + /// ``` + /// + /// Similarly, `key.open_within(aad, in_out, 0..)` is equivalent to + /// `key.open_in_place(aad, in_out)`. + /// + /// When `open_in_place()` returns `Err(..)`, `in_out` may have been + /// overwritten in an unspecified way. + /// + /// The shifting feature is useful in the case where multiple packets are + /// being reassembled in place. 
Consider this example where the peer has + /// sent the message “Split stream reassembled in place” split into + /// three sealed packets: + /// + /// ```ascii-art + /// Packet 1 Packet 2 Packet 3 + /// Input: [Header][Ciphertext][Tag][Header][Ciphertext][Tag][Header][Ciphertext][Tag] + /// | +--------------+ | + /// +------+ +-----+ +----------------------------------+ + /// v v v + /// Output: [Plaintext][Plaintext][Plaintext] + /// “Split stream reassembled in place” + /// ``` + /// + /// This reassembly can be accomplished with three calls to `open_within()`. + #[inline] + pub fn open_within<'in_out, A>( + &mut self, + aad: Aad, + in_out: &'in_out mut [u8], + ciphertext_and_tag: RangeFrom, + ) -> Result<&'in_out mut [u8], error::Unspecified> + where + A: AsRef<[u8]>, + { + self.key.open_within( + self.nonce_sequence.advance()?, + aad, + in_out, + ciphertext_and_tag, + ) + } +} diff --git a/ring-0.17.14/src/aead/overlapping/array.rs b/ring-0.17.14/src/aead/overlapping/array.rs new file mode 100644 index 0000000000..9c797da187 --- /dev/null +++ b/ring-0.17.14/src/aead/overlapping/array.rs @@ -0,0 +1,60 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg_attr(not(test), allow(dead_code))] + +use super::Overlapping; +use crate::error::LenMismatchError; +use core::array::TryFromSliceError; + +pub struct Array<'o, T, const N: usize> { + // Invariant: N != 0. + // Invariant: `self.in_out.len() == N`. + in_out: Overlapping<'o, T>, +} + +impl<'o, T, const N: usize> Array<'o, T, N> { + pub(super) fn new(in_out: Overlapping<'o, T>) -> Result { + if N == 0 || in_out.len() != N { + return Err(LenMismatchError::new(N)); + } + Ok(Self { in_out }) + } + + pub fn into_unwritten_output(self) -> &'o mut [T; N] + where + &'o mut [T]: TryInto<&'o mut [T; N], Error = TryFromSliceError>, + { + self.in_out + .into_unwritten_output() + .try_into() + .unwrap_or_else(|TryFromSliceError { .. }| { + unreachable!() // Due to invariant + }) + } +} + +impl Array<'_, T, N> { + pub fn input<'s>(&'s self) -> &'s [T; N] + where + &'s [T]: TryInto<&'s [T; N], Error = TryFromSliceError>, + { + self.in_out + .input() + .try_into() + .unwrap_or_else(|TryFromSliceError { .. }| { + unreachable!() // Due to invariant + }) + } +} diff --git a/ring-0.17.14/src/aead/overlapping/base.rs b/ring-0.17.14/src/aead/overlapping/base.rs new file mode 100644 index 0000000000..6923587223 --- /dev/null +++ b/ring-0.17.14/src/aead/overlapping/base.rs @@ -0,0 +1,152 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
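The shifted-decryption behaviour documented above (plaintext written back at the front of the buffer) can also be exercised without a `NonceSequence` via `LessSafeKey::open_within`; a hedged sketch, where `header_len` is a made-up parameter for the bytes to be skipped:

```rust
// Illustrative only (not part of the vendored sources): opening one sealed
// record in place when it is preceded by `header_len` bytes that should be
// skipped, mirroring the OpeningKey::open_within docs above.
use ring::aead::{Aad, LessSafeKey, Nonce};

fn open_shifted<'a>(
    key: &LessSafeKey,
    nonce: [u8; 12],
    buf: &'a mut [u8],
    header_len: usize,
) -> Result<&'a mut [u8], ring::error::Unspecified> {
    // buf[header_len..] is ciphertext || tag; on success the plaintext has
    // been shifted down to buf[..plaintext.len()].
    key.open_within(
        Nonce::assume_unique_for_key(nonce),
        Aad::empty(),
        buf,
        header_len..,
    )
}
```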
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub use self::index_error::IndexError; +use super::Array; +use crate::error::LenMismatchError; +use core::{mem, ops::RangeFrom}; + +pub struct Overlapping<'o, T> { + // Invariant: self.src.start <= in_out.len(). + in_out: &'o mut [T], + src: RangeFrom, +} + +impl<'o, T> From<&'o mut [T]> for Overlapping<'o, T> { + fn from(in_out: &'o mut [T]) -> Self { + Self { in_out, src: 0.. } + } +} + +impl<'o, T> Overlapping<'o, T> { + pub fn new(in_out: &'o mut [T], src: RangeFrom) -> Result { + match in_out.get(src.clone()) { + Some(_) => Ok(Self { in_out, src }), + None => Err(IndexError::new(src.start)), + } + } + + #[cfg(any( + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86" + ))] + pub fn copy_within(self) -> &'o mut [T] + where + T: Copy, + { + if self.src.start == 0 { + self.in_out + } else { + let len = self.len(); + self.in_out.copy_within(self.src, 0); + &mut self.in_out[..len] + } + } + + #[cfg(any( + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86" + ))] + pub fn into_slice_src_mut(self) -> (&'o mut [T], RangeFrom) { + (self.in_out, self.src) + } + + pub fn into_unwritten_output(self) -> &'o mut [T] { + let len = self.len(); + self.in_out.get_mut(..len).unwrap_or_else(|| { + // The invariant ensures this succeeds. + unreachable!() + }) + } +} + +impl Overlapping<'_, T> { + pub fn len(&self) -> usize { + self.input().len() + } + + pub fn input(&self) -> &[T] { + self.in_out.get(self.src.clone()).unwrap_or_else(|| { + // Ensured by invariant. + unreachable!() + }) + } + + pub fn with_input_output_len(self, f: impl FnOnce(*const T, *mut T, usize) -> R) -> R { + let len = self.len(); + let output = self.in_out.as_mut_ptr(); + // TODO: MSRV(1.65): use `output.cast_const()` + let output_const: *const T = output; + // SAFETY: The constructor ensures that `src` is a valid range. + // Equivalent to `self.in_out[src.clone()].as_ptr()` but without + // worries about compatibility with the stacked borrows model. + // TODO(MSRV-1.80, probably): Avoid special casing 0; see + // https://github.com/rust-lang/rust/pull/117329 + // https://github.com/rust-lang/rustc_codegen_gcc/issues/516 + let input = if self.src.start == 0 { + output_const + } else { + unsafe { output_const.add(self.src.start) } + }; + f(input, output, len) + } + + // Perhaps unlike `slice::split_first_chunk_mut`, this is biased, + // performance-wise, against the case where `N > self.len()`, so callers + // should be structured to avoid that. + // + // If the result is `Err` then nothing was written to `self`; if anything + // was written then the result will not be `Err`. 
+ #[cfg_attr(not(test), allow(dead_code))] + pub fn split_first_chunk( + mut self, + f: impl for<'a> FnOnce(Array<'a, T, N>), + ) -> Result { + let src = self.src.clone(); + let end = self + .src + .start + .checked_add(N) + .ok_or_else(|| IndexError::new(N))?; + let first = self + .in_out + .get_mut(..end) + .ok_or_else(|| IndexError::new(N))?; + let first = Overlapping::new(first, src).unwrap_or_else(|IndexError { .. }| { + // Since `end == src.start + N`. + unreachable!() + }); + let first = Array::new(first).unwrap_or_else(|LenMismatchError { .. }| { + // Since `end == src.start + N`. + unreachable!() + }); + // Once we call `f`, we must return `Ok` because `f` may have written + // over (part of) the input. + Ok({ + f(first); + let tail = mem::take(&mut self.in_out).get_mut(N..).unwrap_or_else(|| { + // There are at least `N` elements since `end == src.start + N`. + unreachable!() + }); + Self::new(tail, self.src).unwrap_or_else(|IndexError { .. }| { + // Follows from `end == src.start + N`. + unreachable!() + }) + }) + } +} + +cold_exhaustive_error! { + struct index_error::IndexError { index: usize } +} diff --git a/ring-0.17.14/src/aead/overlapping/mod.rs b/ring-0.17.14/src/aead/overlapping/mod.rs new file mode 100644 index 0000000000..77086cda43 --- /dev/null +++ b/ring-0.17.14/src/aead/overlapping/mod.rs @@ -0,0 +1,23 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub use self::{ + array::Array, + base::{IndexError, Overlapping}, + partial_block::PartialBlock, +}; + +mod array; +mod base; +mod partial_block; diff --git a/ring-0.17.14/src/aead/overlapping/partial_block.rs b/ring-0.17.14/src/aead/overlapping/partial_block.rs new file mode 100644 index 0000000000..9b7864a1e1 --- /dev/null +++ b/ring-0.17.14/src/aead/overlapping/partial_block.rs @@ -0,0 +1,59 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::Overlapping; +use crate::error::InputTooLongError; + +pub struct PartialBlock<'i, T, const BLOCK_LEN: usize> { + // invariant: `self.in_out.len() < BLOCK_LEN`. 
+ in_out: Overlapping<'i, T>, +} + +impl<'i, T, const BLOCK_LEN: usize> PartialBlock<'i, T, BLOCK_LEN> { + pub fn new(in_out: Overlapping<'i, T>) -> Result { + let len = in_out.len(); + if len >= BLOCK_LEN { + return Err(InputTooLongError::new(len)); + } + Ok(Self { in_out }) + } + + pub fn overwrite_at_start(self, padded: [T; BLOCK_LEN]) + where + T: Copy, + { + let len = self.len(); + let output = self.in_out.into_unwritten_output(); + assert!(output.len() <= padded.len()); + output.copy_from_slice(&padded[..len]); + } +} + +impl PartialBlock<'_, T, BLOCK_LEN> { + #[inline(always)] + pub fn input(&self) -> &[T] { + let r = self.in_out.input(); + // Help the optimizer optimize the caller using the invariant. + // TODO: Does this actually help? + if r.len() >= BLOCK_LEN { + unreachable!() + } + r + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.input().len() + } +} diff --git a/ring-0.17.14/src/aead/poly1305.rs b/ring-0.17.14/src/aead/poly1305.rs new file mode 100644 index 0000000000..70dcda3126 --- /dev/null +++ b/ring-0.17.14/src/aead/poly1305.rs @@ -0,0 +1,117 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// TODO: enforce maximum input length. + +use super::{Tag, TAG_LEN}; +#[cfg(all(target_arch = "arm", target_endian = "little"))] +use crate::cpu::GetFeature as _; +use crate::{cpu, polyfill::slice::AsChunks}; + +mod ffi_arm_neon; +mod ffi_fallback; + +/// A Poly1305 key. 
+pub(super) struct Key { + key_and_nonce: [u8; KEY_LEN], +} + +pub(super) const BLOCK_LEN: usize = 16; +pub(super) const KEY_LEN: usize = 2 * BLOCK_LEN; + +impl Key { + #[inline] + pub(super) fn new(key_and_nonce: [u8; KEY_LEN]) -> Self { + Self { key_and_nonce } + } +} + +pub(super) enum Context { + #[cfg(all(target_arch = "arm", target_endian = "little"))] + ArmNeon(ffi_arm_neon::State), + Fallback(ffi_fallback::State), +} + +impl Context { + #[inline] + pub(super) fn from_key(key: Key, cpu: cpu::Features) -> Self { + #[cfg(all(target_arch = "arm", target_endian = "little"))] + if let Some(cpu) = cpu.get_feature() { + return ffi_arm_neon::State::new_context(key, cpu); + } + let _: cpu::Features = cpu; + ffi_fallback::State::new_context(key) + } + + pub fn update_block(&mut self, input: [u8; BLOCK_LEN]) { + self.update(AsChunks::from_ref(&input)) + } + + pub fn update(&mut self, input: AsChunks) { + self.update_internal(input.as_flattened()); + } + + fn update_internal(&mut self, input: &[u8]) { + match self { + #[cfg(all(target_arch = "arm", target_endian = "little"))] + Self::ArmNeon(state) => state.update_internal(input), + Self::Fallback(state) => state.update_internal(input), + } + } + + pub(super) fn finish(mut self, input: &[u8]) -> Tag { + self.update_internal(input); + match self { + #[cfg(all(target_arch = "arm", target_endian = "little"))] + Self::ArmNeon(state) => state.finish(), + Self::Fallback(state) => state.finish(), + } + } +} + +/// Implements the original, non-IETF padding semantics. +/// +/// This is used by chacha20_poly1305_openssh and the standalone +/// poly1305 test vectors. +pub(super) fn sign(key: Key, input: &[u8], cpu_features: cpu::Features) -> Tag { + let ctx = Context::from_key(key, cpu_features); + ctx.finish(input) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::testutil as test; + + // Adapted from BoringSSL's crypto/poly1305/poly1305_test.cc. + #[test] + pub fn test_poly1305() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("poly1305_test.txt"), + |section, test_case| { + assert_eq!(section, ""); + let key = test_case.consume_bytes("Key"); + let key: &[u8; KEY_LEN] = key.as_slice().try_into().unwrap(); + let input = test_case.consume_bytes("Input"); + let expected_mac = test_case.consume_bytes("MAC"); + let key = Key::new(*key); + let Tag(actual_mac) = sign(key, &input, cpu_features); + assert_eq!(expected_mac, actual_mac.as_ref()); + + Ok(()) + }, + ) + } +} diff --git a/ring-0.17.14/src/aead/poly1305/ffi_arm_neon.rs b/ring-0.17.14/src/aead/poly1305/ffi_arm_neon.rs new file mode 100644 index 0000000000..5c1542d191 --- /dev/null +++ b/ring-0.17.14/src/aead/poly1305/ffi_arm_neon.rs @@ -0,0 +1,98 @@ +// Copyright 2015-2025 Brian Smith. +// Portions Copyright (c) 2014, 2015, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
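The test-vector file consumed by the vendored Poly1305 test above (`poly1305_test.txt`, included later in this patch) is a simple line-oriented `Key = ... / Input = ... / MAC = ...` format. A rough standalone sketch of decoding one hex-valued record follows; it ignores the quoted-string `Input` form used by the RFC 8439 example, and the vendored `testutil` harness is what actually parses the file:

```rust
// Illustrative sketch only (not vendored code): decoding one hex-valued
// record of poly1305_test.txt.
fn hex_to_bytes(s: &str) -> Vec<u8> {
    (0..s.len())
        .step_by(2)
        .map(|i| u8::from_str_radix(&s[i..i + 2], 16).expect("invalid hex"))
        .collect()
}

fn parse_record(record: &str) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
    let (mut key, mut input, mut mac) = (Vec::new(), Vec::new(), Vec::new());
    for line in record.lines() {
        if let Some(v) = line.strip_prefix("Key = ") {
            key = hex_to_bytes(v.trim());
        } else if let Some(v) = line.strip_prefix("Input = ") {
            input = hex_to_bytes(v.trim());
        } else if let Some(v) = line.strip_prefix("MAC = ") {
            mac = hex_to_bytes(v.trim());
        }
    }
    (key, input, mac)
}

fn main() {
    // One of the RFC 8439 A.3 vectors from the file below.
    let (key, input, mac) = parse_record(
        "Key = 0200000000000000000000000000000000000000000000000000000000000000\n\
         Input = ffffffffffffffffffffffffffffffff\n\
         MAC = 03000000000000000000000000000000\n",
    );
    assert_eq!(key.len(), 32);
    assert_eq!(input.len(), 16);
    assert_eq!(mac.len(), 16);
    // A Poly1305 implementation would now check that MAC(Key, Input) == MAC.
}
```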
+ +#![cfg(all(target_arch = "arm", target_endian = "little"))] + +use super::{Key, Tag, KEY_LEN, TAG_LEN}; +use crate::{c, cpu::arm::Neon}; +use core::num::NonZeroUsize; + +// XXX/TODO(MSRV): change to `pub(super)`. +pub(in super::super) struct State { + state: poly1305_state_st, + neon: Neon, +} + +// TODO: Is 16 enough? +#[repr(C, align(16))] +struct poly1305_state_st { + r: fe1305x2, + h: fe1305x2, + c: fe1305x2, + precomp: [fe1305x2; 2], + + data: [u8; data_len()], + + buf: [u8; 32], + buf_used: c::size_t, + key: [u8; 16], +} + +const fn data_len() -> usize { + 128 +} + +#[derive(Clone, Copy)] +#[repr(C)] +struct fe1305x2 { + v: [u32; 12], // for alignment; only using 10 +} + +impl State { + pub(super) fn new_context(Key { key_and_nonce }: Key, neon: Neon) -> super::Context { + prefixed_extern! { + fn CRYPTO_poly1305_init_neon(state: &mut poly1305_state_st, key: &[u8; KEY_LEN]); + } + let mut r = Self { + state: poly1305_state_st { + r: fe1305x2 { v: [0; 12] }, + h: fe1305x2 { v: [0; 12] }, + c: fe1305x2 { v: [0; 12] }, + precomp: [fe1305x2 { v: [0; 12] }; 2], + + data: [0u8; data_len()], + buf: Default::default(), + buf_used: 0, + key: [0u8; 16], + }, + neon, + }; + unsafe { CRYPTO_poly1305_init_neon(&mut r.state, &key_and_nonce) } + super::Context::ArmNeon(r) + } + + pub(super) fn update_internal(&mut self, input: &[u8]) { + prefixed_extern! { + fn CRYPTO_poly1305_update_neon( + st: &mut poly1305_state_st, + input: *const u8, + in_len: c::NonZero_size_t); + } + if let Some(len) = NonZeroUsize::new(input.len()) { + let _: Neon = self.neon; + let input = input.as_ptr(); + unsafe { CRYPTO_poly1305_update_neon(&mut self.state, input, len) } + } + } + + pub(super) fn finish(mut self) -> Tag { + prefixed_extern! { + fn CRYPTO_poly1305_finish_neon(st: &mut poly1305_state_st, mac: &mut [u8; TAG_LEN]); + } + let mut tag = Tag([0u8; TAG_LEN]); + unsafe { CRYPTO_poly1305_finish_neon(&mut self.state, &mut tag.0) } + tag + } +} diff --git a/ring-0.17.14/src/aead/poly1305/ffi_fallback.rs b/ring-0.17.14/src/aead/poly1305/ffi_fallback.rs new file mode 100644 index 0000000000..f689cfcb33 --- /dev/null +++ b/ring-0.17.14/src/aead/poly1305/ffi_fallback.rs @@ -0,0 +1,96 @@ +// Copyright 2015-2025 Brian Smith. +// Portions Copyright (c) 2014, 2015, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Key, Tag, KEY_LEN, TAG_LEN}; +use crate::c; +use core::num::NonZeroUsize; + +// XXX/TODO(MSRV): change to `pub(super)`. 
+pub(in super::super) struct State { + state: poly1305_state_st, +} + +// Keep in sync with `poly1305_state_st` in poly1305.c +#[repr(C, align(64))] +struct poly1305_state_st { + r0: u32, + r1: u32, + r2: u32, + r3: u32, + r4: u32, + s1: u32, + s2: u32, + s3: u32, + s4: u32, + h0: u32, + h1: u32, + h2: u32, + h3: u32, + h4: u32, + key: [u8; 16], +} + +impl State { + pub(super) fn new_context(Key { key_and_nonce }: Key) -> super::Context { + prefixed_extern! { + fn CRYPTO_poly1305_init(state: &mut poly1305_state_st, key: &[u8; KEY_LEN]); + } + let mut r = Self { + state: poly1305_state_st { + r0: 0, + r1: 0, + r2: 0, + r3: 0, + r4: 0, + s1: 0, + s2: 0, + s3: 0, + s4: 0, + h0: 0, + h1: 0, + h2: 0, + h3: 0, + h4: 0, + key: [0u8; 16], + }, + }; + unsafe { CRYPTO_poly1305_init(&mut r.state, &key_and_nonce) } + super::Context::Fallback(r) + } + + // `input.len % BLOCK_LEN == 0` must be true for every call except the + // final one. + pub(super) fn update_internal(&mut self, input: &[u8]) { + prefixed_extern! { + fn CRYPTO_poly1305_update( + state: &mut poly1305_state_st, + input: *const u8, + in_len: c::NonZero_size_t); + } + if let Some(len) = NonZeroUsize::new(input.len()) { + let input = input.as_ptr(); + unsafe { CRYPTO_poly1305_update(&mut self.state, input, len) } + } + } + + pub(super) fn finish(mut self) -> Tag { + prefixed_extern! { + fn CRYPTO_poly1305_finish(statep: &mut poly1305_state_st, mac: &mut [u8; TAG_LEN]); + } + let mut tag = Tag([0u8; TAG_LEN]); + unsafe { CRYPTO_poly1305_finish(&mut self.state, &mut tag.0) } + tag + } +} diff --git a/ring-0.17.14/src/aead/poly1305_test.txt b/ring-0.17.14/src/aead/poly1305_test.txt new file mode 100644 index 0000000000..586477df8a --- /dev/null +++ b/ring-0.17.14/src/aead/poly1305_test.txt @@ -0,0 +1,170 @@ +# Test Vectors from OpenSSL commit bbe9769ba66ab2512678a87b0d9b266ba970db05. + +Key = 2d773be37adb1e4d683bf0075e79c4ee037918535a7f99ccb7040fb5f5f43aea +Input = 89dab80b7717c1db5db437860a3f70218e93e1b8f461fb677f16f35f6f87e2a91c99bc3a47ace47640cc95c345be5ecca5a3523c35cc01893af0b64a620334270372ec12482d1b1e363561698a578b359803495bb4e2ef1930b17a5190b580f141300df30adbeca28f6427a8bc1a999fd51c554a017d095d8c3e3127daf9f595 +MAC = c85d15ed44c378d6b00e23064c7bcd51 + +Key = 99e5822dd4173c995e3dae0ddefb97743fde3b080134b39f76e9bf8d0e88d546 +Input = 000000000000000b170303020000000006db1f1f368d696a810a349c0c714c9a5e7850c2407d721acded95e018d7a85266a6e1289cdb4aeb18da5ac8a2b0026d24a59ad485227f3eaedbb2e7e35e1c66cd60f9abf716dcc9ac42682dd7dab287a7024c4eefc321cc0574e16793e37cec03c5bda42b54c114a80b57af26416c7be742005e20855c73e21dc8e2edc9d435cb6f6059280011c270b71570051c1c9b3052126620bc1e2730fa066c7a509d53c60e5ae1b40aa6e39e49669228c90eecb4a50db32a50bc49e90b4f4b359a1dfd11749cd3867fcf2fb7bb6cd4738f6a4ad6f7ca5058f7618845af9f020f6c3b967b8f4cd4a91e2813b507ae66f2d35c18284f7292186062e10fd5510d18775351ef334e7634ab4743f5b68f49adcab384d3fd75f7390f4006ef2a295c8c7a076ad54546cd25d2107fbe1436c840924aaebe5b370893cd63d1325b8616fc4810886bc152c53221b6df373119393255ee72bcaa880174f1717f9184fa91646f17a24ac55d16bfddca9581a92eda479201f0edbf633600d6066d1ab36d5d2415d71351bbcd608a25108d25641992c1f26c531cf9f90203bc4cc19f5927d834b0a47116d3884bbb164b8ec883d1ac832e56b3918a98601a08d171881541d594db399c6ae6151221745aec814c45b0b05b565436fd6f137aa10a0c0b643761dbd6f9a9dcb99b1a6e690854ce0769cde39761d82fcdec15f0d92d7d8e94ade8eb83fbe0 +MAC = 2637408fe13086ea73f971e3425e2820 + + +# RFC 8439, section 2.5.2. 
+ +Key = 85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b +Input = "Cryptographic Forum Research Group" +MAC = a8061dc1305136c6c22b8baf0c0127a9 + + +# RFC 8439, section A.3. + +Key = 0000000000000000000000000000000000000000000000000000000000000000 +Input = 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +MAC = 00000000000000000000000000000000 + +Key = 0000000000000000000000000000000036e5f6b5c5e06070f0efca96227a863e +Input = 416e79207375626d697373696f6e20746f20746865204945544620696e74656e6465642062792074686520436f6e7472696275746f7220666f72207075626c69636174696f6e20617320616c6c206f722070617274206f6620616e204945544620496e7465726e65742d4472616674206f722052464320616e6420616e792073746174656d656e74206d6164652077697468696e2074686520636f6e74657874206f6620616e204945544620616374697669747920697320636f6e7369646572656420616e20224945544620436f6e747269627574696f6e222e20537563682073746174656d656e747320696e636c756465206f72616c2073746174656d656e747320696e20494554462073657373696f6e732c2061732077656c6c206173207772697474656e20616e6420656c656374726f6e696320636f6d6d756e69636174696f6e73206d61646520617420616e792074696d65206f7220706c6163652c207768696368206172652061646472657373656420746f +MAC = 36e5f6b5c5e06070f0efca96227a863e + +Key = 36e5f6b5c5e06070f0efca96227a863e00000000000000000000000000000000 +Input = 416e79207375626d697373696f6e20746f20746865204945544620696e74656e6465642062792074686520436f6e7472696275746f7220666f72207075626c69636174696f6e20617320616c6c206f722070617274206f6620616e204945544620496e7465726e65742d4472616674206f722052464320616e6420616e792073746174656d656e74206d6164652077697468696e2074686520636f6e74657874206f6620616e204945544620616374697669747920697320636f6e7369646572656420616e20224945544620436f6e747269627574696f6e222e20537563682073746174656d656e747320696e636c756465206f72616c2073746174656d656e747320696e20494554462073657373696f6e732c2061732077656c6c206173207772697474656e20616e6420656c656374726f6e696320636f6d6d756e69636174696f6e73206d61646520617420616e792074696d65206f7220706c6163652c207768696368206172652061646472657373656420746f +MAC = f3477e7cd95417af89a6b8794c310cf0 + +Key = 1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0 +Input = 2754776173206272696c6c69672c20616e642074686520736c6974687920746f7665730a446964206779726520616e642067696d626c6520696e2074686520776162653a0a416c6c206d696d737920776572652074686520626f726f676f7665732c0a416e6420746865206d6f6d65207261746873206f757467726162652e +MAC = 4541669a7eaaee61e708dc7cbcc5eb62 + +Key = 0200000000000000000000000000000000000000000000000000000000000000 +Input = ffffffffffffffffffffffffffffffff +MAC = 03000000000000000000000000000000 + +Key = 02000000000000000000000000000000ffffffffffffffffffffffffffffffff +Input = 02000000000000000000000000000000 +MAC = 03000000000000000000000000000000 + +Key = 0100000000000000000000000000000000000000000000000000000000000000 +Input = fffffffffffffffffffffffffffffffff0ffffffffffffffffffffffffffffff11000000000000000000000000000000 +MAC = 05000000000000000000000000000000 + +Key = 0100000000000000000000000000000000000000000000000000000000000000 +Input = fffffffffffffffffffffffffffffffffbfefefefefefefefefefefefefefefe01010101010101010101010101010101 +MAC = 00000000000000000000000000000000 + +Key = 0200000000000000000000000000000000000000000000000000000000000000 +Input = fdffffffffffffffffffffffffffffff +MAC = faffffffffffffffffffffffffffffff + +Key = 0100000000000000040000000000000000000000000000000000000000000000 
+Input = e33594d7505e43b900000000000000003394d7505e4379cd01000000000000000000000000000000000000000000000001000000000000000000000000000000 +MAC = 14000000000000005500000000000000 + +Key = 0100000000000000040000000000000000000000000000000000000000000000 +Input = e33594d7505e43b900000000000000003394d7505e4379cd010000000000000000000000000000000000000000000000 +MAC = 13000000000000000000000000000000 + + +# Additional test vectors that are long enough to ensure OpenSSL's SIMD +# assembly is fully tested. + +# Length 2048. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee
8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfed +MAC = 69d28f73dd09d39a92aa179da354b7ea + +# Length 2049. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac6
87833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc8 +MAC = d6a26654b88572e875d9661c83471c1b + +# Length 2050. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852 +MAC = 9fbbb7f7adcd0cd5b46a4a520b22499a + +# Length 2051. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a
411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f5 +MAC = eb7cdceb97ade2a07622f8f5a4b1ce15 + +# Length 2052. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b
5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f590 +MAC = d41c310927cd92e14784ea78b85503db + +# Length 2053. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073 +MAC = 16af133c423f783a14c49d9f526384cf + +# Length 2054. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c
27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4 +MAC = 00c75db8f0636b22f195645b03091f5f + +# Length 2055. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f576
28170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f434 +MAC = 4a532bc740f581555831345f3b75bf33 + +# Length 2056. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a +MAC = 698c7d32c5923871d124a2479e521706 + +# Length 2057. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c
20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c +MAC = a677187dbf3c927aeeafb9ebce0f61dc + +# Length 2058. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b52
0a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a +MAC = 201fed7eee981b31d2cc42ff6c38141a + +# Length 2059. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28 +MAC = 0c3d3d01a37f347c4f7c5826bcafb3e1 + +# Length 2060. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed
0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c9 +MAC = 33a4e0e0bed7c84c5cc5dd4784410f07 + +# Length 2061. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9
d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e +MAC = 8e41c40a2f8ec58fe594f3a3a2de4ae1 + +# Length 2062. 
+Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117bdac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb
7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e21 +MAC = c6e5d1810fd878ac6b844c66cef36a22 + +# Length 2063. +Key = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f +Input = 248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e874cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936aff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a3709894e4eb0a4eedc4ae19468e66b81f271351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb14cf77bde35e78ae9ad7d3f57ed7e7f23926c9172f82d77684ea5ed7d74ebc6f142b997036bcb7cce8df1bbc0d5b35a46509c954fc9469d214d6238f166cbf872156b4c41d7aac5942cffb175023078252a3f36e315c5d4ce0e39928a018252862becacef96a19f03bdcf46d75584299d1f8b03c0169e9e407d937145b5e5024139e7022a1978f114f24cdfa23780a119735c41da8fb759bbb3f025c6ec30e6c6e9bce8615be68e392fce59fd26a8e6a6cc5c606e3848116e4d01d29565a1facfb524b6d29643b826eee1e42869fc76df229dd79b39a2b1df28bb335c3a5f15a855d0121e4a6da34b5e4d5b7b5d5746a03ecff70811e1516fcec1bf7462e8876a2d21710aa168c78f45a6a15015950e221da85d3ec822ad6d0a6931b25a06b7bb5f3c10bb36cd4d647f9561982fde9818de5d4bf8db7f86c53b4ff14928ac15f79023b61861e73e44216540bb302153770da2533de9795252ab5fb77ad924c9338c8144c23d4c90dab9a18feac1a1574d4545e1435eb405e6c4c439fc724fce992ae85badf345bad16d85fbd338f04433703614754d0e7e54c4ccde2670587d52ecfb5a70a14a501bacc727722649931d8515b13d020a78e511fe136d45fbf97f9c7f689fcc677cfb3683723878350ffe9d08130cc6e567b6179e01b7eb2b3bbcf0873e1308eec018edeb8cce946338e15d5bf68c71916a83a99358039ef071e009546a2df936879dffbba397a93925d229a469fd17d71b7f524e03a30da6ee927542f8b369bed4734fe25dbd63d24ffd2a222f5f84f75d858ab989be925af570ad6d45bd28ce61b5139e1dd2f0b7795fe072e6e83acbb5e7b777a70c641e4cab2af40eed69abc334cd2703c3273204fac580c6a3d6680427e5f7d051e8380a53f93a180f4556ecea4530b9a2d5948dad63d415b6874f6b90e767d6d265be86351b53ba690780bb57c21b57418c5b97559e840c68257f839e7583a4bf7c7645c5987d40cc1ba79a218c35edfacdabe581d950e4bb7a481ebe64d61d00e75b1f25f1ce5f5462334a5b9038a697aa0937a3f8017e05d2c9c05dcb05c0b02508dea619b137f5444b6f088eb3cb2c66788f88afdfbba8faa1c490485624c88ae11e57347a676902e7553f056188493209bdbb30acc63c9e41e16a9d6c009416b520a76ba38f57628170c43626b5cb46179dc5bf65de865085f84bf741c223fbe474d2d19d8f43914fbd6586351089e73babf344f988b7963fe44528457d7aad3c564f6bcbd0d772a4c9fd328e6022d1c7c9f86726f8d5a23797d309c0f653ab1ac687833eb2700f156296062a8b377078f45f6b68c3d07cae1913ba8d5a6f9bf7525a3439eb932d4cefc4bf8e1b07b48ca13ece366cbc3e0388915915d1757475103a9e9454e7e6355de2d6acbf4710f9a63e4f6d3cd70c2d6fca88dd8a14448fdb63ce9350fdaafbe0b8bd1c5d307dae76dfed799aef2d8f23d5608d37d1330dd38b94860905dbeebf78d7b7318b7d42aed40d3f9899e9f420cbd92a6eeae3026f7725694e0e4bee016ba346fed2c21172bdb4a461cebe0cfe38e76645226ac127a259c193264d735ce8c8a57e17dd3f0579e2e86dc295ad1f45ba2d85db35044da61f7d401274b31eefbeb34e8d2ae596e9b4541aae117b
dac5ed0b324c20539c27c07a411d5288b0b5f6fa16e9a7df85dc319fa6b71cd08a859c06a3f7b0289e1750adbf182f9750fea96fea5ab7aa3473340607cd7ed2c626f5382491c26d5d5bea61401dee7319c94d418f297e61ceac8f258ee8c23831bda081591f5a918e96855774ddedffc51e5b180f1971806d42fc333020b734aeb45adb0bc47325d0cea5f6713a786558022afc39d573892aa3635efbfd8bcb11c57f306c72146afe8b45388125cb7bf9ecf965a7ba4f768c77be366470dcdcf214b7f6a5a9460ed4fe44ae559d85e2fdc2094de83fff12ea8804db1215c4ca865871bdd7f8ef32ab799bf923ffb02c1ded7d129beadad46c5eda31ab1a6f43da05ea08bff7ffa88d8966353d01830558c39b930b01d175e437124d8edd0d2698fd8932f2b2c9b14746e52879c57a395538150f390264f00e60d470711202f4194499ff79037ca9885dc8d695f7d917a3086ca88e8f8d0243efee09302cf39e039eb7cc8dd19d28120d5fe533b5727cd39133181c729ca6f90a015ed30be7668d5cb5ecc33a53ee69bf7d1a5ecbdb153803743c6adaaabd36bf84e5be38d3f04a5d5dbfd67bdcd3b176e65bd1391ade775cc32ce43a847fb6c672a3fe97a5d4081c4986959ec5fb898f42a9397ba2b3ec2c1018f8d76d057f2366bd0e4465514ad6560c599664fb85621fe771e00f43d39b591b2a6a321100f4d1ef23a376d5ae3eeedbfe23da73dff0ee4d16b34ebddd8f5f053db9824105fc7300dbee7ea6af56b112319e3e215a0fc79ae946f6b5227453ec7fcaf17cf7651f71499a50d81221404d5f129ac50ea7528ff0e0069ec4ab8acb7919d81749ab37a870c5ef2cc5a15cf96709d3c65b4addc77e7416847160bcabb94ea36377e0ef71be80b5cc53effd5444888044a353574c72c924bba2a8b4e8354188ebfedc852f59073f4347a8c8a28c99e21df +MAC = f6eaae369c3cb5c05748e8d919178e00 + +# Regression test for https://rt.openssl.org/Ticket/Display.html?id=4439 +Key = 2d773be37adb1e4d683bf0075e79c4ee037918535a7f99ccb7040fb5f5f43aea +Input = 89dab80b7717c1db5db437860a3f70218e93e1b8f461fb677f16f35f6f87e2a91c99bc3a47ace47640cc95c345be5ecca5a3523c35cc01893af0b64a620334270372ec12482d1b1e363561698a578b359803495bb4e2ef1930b17a5190b580f141300df30adbeca28f6427a8bc1a999fd51c554a017d095d8c3e3127daf9f595 +MAC = c85d15ed44c378d6b00e23064c7bcd51 + +# Regression tests for https://rt.openssl.org/Ticket/Display.html?id=4483 + +Key = 7f1b02640000000000000000000000000000000000000000cccccccccccccccc +Input = cccccccccccccccccccccccccccccccccccccccccccccccccc80ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccceccccccccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccccccccccccce3ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccaccccccccccccccccccccce6cccccccccc000000afccccccccccccccccccfffffff5000000000000000000000000000000000000000000000000000000ffffffe70000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000719205a8521dfc +MAC = 8559b876eceed66eb37798c0457baff9 + +Key = e00016000000000000000000000000000000aaaaaaaaaaaaaaaaaaaaaaaaaaaa +Input = aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa000000000000000000800264 +MAC = 00bd1258978e205444c9aaaa82006fed + +Key = 0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c +Input = 02fc +MAC = 06120c0c0c0c0c0c0c0c0c0c0c0c0c0c + +Key = 00ff000000000000000000000000000000000000001e00000000000000007b7b +Input = 
7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b001300000000b300000000000000000000000000000000000000000000f20000000000000000000000000000000000002000efff0009000000000000000000000000100000000009000000640000000000000000000000001300000000b300000000000000000000000000000000000000000000f20000000000000000000000000000000000002000efff00090000000000000000007a000010000000000900000064000000000000000000000000000000000000000000000000fc +MAC = 33205bbf9e9f8f7212ab9e2ab9b7e4a5 diff --git a/ring-0.17.14/src/aead/quic.rs b/ring-0.17.14/src/aead/quic.rs new file mode 100644 index 0000000000..a0cbe08d7a --- /dev/null +++ b/ring-0.17.14/src/aead/quic.rs @@ -0,0 +1,187 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! QUIC Header Protection. +//! +//! See draft-ietf-quic-tls. + +use crate::{ + aead::{aes, chacha}, + cpu, error, hkdf, +}; + +/// A key for generating QUIC Header Protection masks. +pub struct HeaderProtectionKey { + inner: KeyInner, + algorithm: &'static Algorithm, +} + +#[allow(clippy::large_enum_variant, variant_size_differences)] +enum KeyInner { + Aes(aes::Key), + ChaCha20(chacha::Key), +} + +impl From> for HeaderProtectionKey { + fn from(okm: hkdf::Okm<&'static Algorithm>) -> Self { + let mut key_bytes = [0; super::MAX_KEY_LEN]; + let algorithm = *okm.len(); + let key_bytes = &mut key_bytes[..algorithm.key_len()]; + okm.fill(key_bytes).unwrap(); + Self::new(algorithm, key_bytes).unwrap() + } +} + +impl HeaderProtectionKey { + /// Create a new header protection key. + /// + /// `key_bytes` must be exactly `algorithm.key_len` bytes long. + pub fn new( + algorithm: &'static Algorithm, + key_bytes: &[u8], + ) -> Result { + Ok(Self { + inner: (algorithm.init)(key_bytes, cpu::features())?, + algorithm, + }) + } + + /// Generate a new QUIC Header Protection mask. + /// + /// `sample` must be exactly `self.algorithm().sample_len()` bytes long. + pub fn new_mask(&self, sample: &[u8]) -> Result<[u8; 5], error::Unspecified> { + let sample = <&[u8; SAMPLE_LEN]>::try_from(sample)?; + + let out = (self.algorithm.new_mask)(&self.inner, *sample); + Ok(out) + } + + /// The key's algorithm. + #[inline(always)] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } +} + +const SAMPLE_LEN: usize = super::TAG_LEN; + +/// QUIC sample for new key masks +pub type Sample = [u8; SAMPLE_LEN]; + +/// A QUIC Header Protection Algorithm. 
+pub struct Algorithm {
+    init: fn(key: &[u8], cpu_features: cpu::Features) -> Result<KeyInner, error::Unspecified>,
+
+    new_mask: fn(key: &KeyInner, sample: Sample) -> [u8; 5],
+
+    key_len: usize,
+    id: AlgorithmID,
+}
+
+impl hkdf::KeyType for &'static Algorithm {
+    #[inline]
+    fn len(&self) -> usize {
+        self.key_len()
+    }
+}
+
+impl Algorithm {
+    /// The length of the key.
+    #[inline(always)]
+    pub fn key_len(&self) -> usize {
+        self.key_len
+    }
+
+    /// The required sample length.
+    #[inline(always)]
+    pub fn sample_len(&self) -> usize {
+        SAMPLE_LEN
+    }
+}
+
+derive_debug_via_id!(Algorithm);
+
+#[derive(Debug, Eq, PartialEq)]
+enum AlgorithmID {
+    AES_128,
+    AES_256,
+    CHACHA20,
+}
+
+impl PartialEq for Algorithm {
+    fn eq(&self, other: &Self) -> bool {
+        self.id == other.id
+    }
+}
+
+impl Eq for Algorithm {}
+
+/// AES-128.
+pub static AES_128: Algorithm = Algorithm {
+    key_len: 16,
+    init: aes_init_128,
+    new_mask: aes_new_mask,
+    id: AlgorithmID::AES_128,
+};
+
+/// AES-256.
+pub static AES_256: Algorithm = Algorithm {
+    key_len: 32,
+    init: aes_init_256,
+    new_mask: aes_new_mask,
+    id: AlgorithmID::AES_256,
+};
+
+fn aes_init_128(key: &[u8], cpu_features: cpu::Features) -> Result<KeyInner, error::Unspecified> {
+    let key = key.try_into().map_err(|_| error::Unspecified)?;
+    let aes_key = aes::Key::new(aes::KeyBytes::AES_128(key), cpu_features)?;
+    Ok(KeyInner::Aes(aes_key))
+}
+
+fn aes_init_256(key: &[u8], cpu_features: cpu::Features) -> Result<KeyInner, error::Unspecified> {
+    let key = key.try_into().map_err(|_| error::Unspecified)?;
+    let aes_key = aes::Key::new(aes::KeyBytes::AES_256(key), cpu_features)?;
+    Ok(KeyInner::Aes(aes_key))
+}
+
+fn aes_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] {
+    let aes_key = match key {
+        KeyInner::Aes(key) => key,
+        _ => unreachable!(),
+    };
+
+    aes_key.new_mask(sample)
+}
+
+/// ChaCha20.
+pub static CHACHA20: Algorithm = Algorithm {
+    key_len: chacha::KEY_LEN,
+    init: chacha20_init,
+    new_mask: chacha20_new_mask,
+    id: AlgorithmID::CHACHA20,
+};
+
+fn chacha20_init(key: &[u8], _cpu_features: cpu::Features) -> Result<KeyInner, error::Unspecified> {
+    let chacha20_key: [u8; chacha::KEY_LEN] = key.try_into()?;
+    Ok(KeyInner::ChaCha20(chacha::Key::new(chacha20_key)))
+}
+
+fn chacha20_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] {
+    let chacha20_key = match key {
+        KeyInner::ChaCha20(key) => key,
+        _ => unreachable!(),
+    };
+
+    chacha20_key.new_mask(sample)
+}
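The quic.rs module above exposes header protection through `HeaderProtectionKey::new` and `new_mask`. As a quick orientation for reviewers, the sketch below shows how that API is typically driven; it is commentary, not part of the patch, the key and sample bytes are placeholders, and it assumes the vendored crate is consumed under its usual `ring::aead::quic` path.

```rust
// Sketch only (not part of the vendored sources): deriving a QUIC
// header-protection mask with the API added in quic.rs. The key and sample
// values are placeholders; real ones come from the TLS key schedule and the
// packet ciphertext.
use ring::aead::quic;
use ring::error::Unspecified;

fn header_mask_sketch() -> Result<[u8; 5], Unspecified> {
    // ChaCha20 header protection uses a 32-byte key (quic::CHACHA20.key_len()).
    let key_bytes = [0u8; 32];
    let hp_key = quic::HeaderProtectionKey::new(&quic::CHACHA20, &key_bytes)?;

    // The sample must be exactly hp_key.algorithm().sample_len() bytes
    // (the AEAD tag length, 16), taken from the packet ciphertext.
    let sample = [0u8; 16];
    hp_key.new_mask(&sample)
}
```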
generic composition paradigm][AEAD] for an introduction to the concept of +//! AEADs. +//! +//! [AEAD]: https://eprint.iacr.org/2000/025.pdf +//! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD + +use super::{Aad, Algorithm, BoundKey, LessSafeKey, NonceSequence, Tag, UnboundKey}; +use crate::error; + +/// An AEAD key for encrypting and signing ("sealing"), bound to a nonce +/// sequence. +/// +/// Intentionally not `Clone` or `Copy` since cloning would allow duplication +/// of the nonce sequence. +pub struct SealingKey { + key: LessSafeKey, + nonce_sequence: N, +} + +impl BoundKey for SealingKey { + fn new(key: UnboundKey, nonce_sequence: N) -> Self { + Self { + key: key.into_inner(), + nonce_sequence, + } + } + + #[inline] + fn algorithm(&self) -> &'static Algorithm { + self.key.algorithm() + } +} + +impl core::fmt::Debug for SealingKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + self.key.fmt_debug("SealingKey", f) + } +} + +impl SealingKey { + /// Encrypts and signs (“seals”) data in place, appending the tag to the + /// resulting ciphertext. + /// + /// `key.seal_in_place_append_tag(aad, in_out)` is equivalent to: + /// + /// ```skip + /// key.seal_in_place_separate_tag(aad, in_out.as_mut()) + /// .map(|tag| in_out.extend(tag.as_ref())) + /// ``` + #[inline] + pub fn seal_in_place_append_tag( + &mut self, + aad: Aad, + in_out: &mut InOut, + ) -> Result<(), error::Unspecified> + where + A: AsRef<[u8]>, + InOut: AsMut<[u8]> + for<'in_out> Extend<&'in_out u8>, + { + self.key + .seal_in_place_append_tag(self.nonce_sequence.advance()?, aad, in_out) + } + + /// Encrypts and signs (“seals”) data in place. + /// + /// `aad` is the additional authenticated data (AAD), if any. This is + /// authenticated but not encrypted. The type `A` could be a byte slice + /// `&[u8]`, a byte array `[u8; N]` for some constant `N`, `Vec`, etc. + /// If there is no AAD then use `Aad::empty()`. + /// + /// The plaintext is given as the input value of `in_out`. `seal_in_place()` + /// will overwrite the plaintext with the ciphertext and return the tag. + /// For most protocols, the caller must append the tag to the ciphertext. + /// The tag will be `self.algorithm.tag_len()` bytes long. + #[inline] + pub fn seal_in_place_separate_tag( + &mut self, + aad: Aad, + in_out: &mut [u8], + ) -> Result + where + A: AsRef<[u8]>, + { + self.key + .seal_in_place_separate_tag(self.nonce_sequence.advance()?, aad, in_out) + } +} diff --git a/ring-0.17.14/src/aead/shift.rs b/ring-0.17.14/src/aead/shift.rs new file mode 100644 index 0000000000..3bd7f46ee4 --- /dev/null +++ b/ring-0.17.14/src/aead/shift.rs @@ -0,0 +1,32 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
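
The `SealingKey`/`NonceSequence` API added in sealing_key.rs above is easiest to follow with a concrete nonce sequence. The following is a minimal usage sketch against the crate's public `ring::aead` API; it is not part of the vendored sources, and the `CounterNonce` type and its 96-bit counter layout are assumptions made only for illustration.

```rust
use ring::aead::{
    Aad, BoundKey, Nonce, NonceSequence, SealingKey, UnboundKey, AES_256_GCM, NONCE_LEN,
};
use ring::error::Unspecified;

/// Illustrative nonce sequence: a big-endian counter in the last 8 bytes of
/// the 96-bit nonce. Each `advance()` yields a fresh, never-repeating nonce,
/// which is the property `SealingKey` exists to enforce.
struct CounterNonce(u64);

impl NonceSequence for CounterNonce {
    fn advance(&mut self) -> Result<Nonce, Unspecified> {
        let mut bytes = [0u8; NONCE_LEN];
        bytes[4..].copy_from_slice(&self.0.to_be_bytes());
        self.0 = self.0.checked_add(1).ok_or(Unspecified)?;
        Ok(Nonce::assume_unique_for_key(bytes))
    }
}

fn seal_demo(key_bytes: &[u8; 32], plaintext: &[u8]) -> Result<Vec<u8>, Unspecified> {
    let unbound = UnboundKey::new(&AES_256_GCM, key_bytes)?;
    let mut key: SealingKey<CounterNonce> = SealingKey::new(unbound, CounterNonce(0));

    // Encrypts the buffer in place and appends the 16-byte authentication tag,
    // exactly as documented for `seal_in_place_append_tag` above.
    let mut in_out = plaintext.to_vec();
    key.seal_in_place_append_tag(Aad::empty(), &mut in_out)?;
    Ok(in_out)
}
```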
+ +#[cfg(target_arch = "x86")] +pub fn shift_full_blocks( + in_out: super::overlapping::Overlapping<'_, u8>, + mut transform: impl FnMut(&[u8; BLOCK_LEN]) -> [u8; BLOCK_LEN], +) { + let (in_out, src) = in_out.into_slice_src_mut(); + let in_out_len = in_out[src.clone()].len(); + + for i in (0..in_out_len).step_by(BLOCK_LEN) { + let block = { + let input = + <&[u8; BLOCK_LEN]>::try_from(&in_out[(src.start + i)..][..BLOCK_LEN]).unwrap(); + transform(input) + }; + let output = <&mut [u8; BLOCK_LEN]>::try_from(&mut in_out[i..][..BLOCK_LEN]).unwrap(); + *output = block; + } +} diff --git a/ring-0.17.14/src/aead/unbound_key.rs b/ring-0.17.14/src/aead/unbound_key.rs new file mode 100644 index 0000000000..23f9df9eb0 --- /dev/null +++ b/ring-0.17.14/src/aead/unbound_key.rs @@ -0,0 +1,74 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Authenticated Encryption with Associated Data (AEAD). +//! +//! See [Authenticated encryption: relations among notions and analysis of the +//! generic composition paradigm][AEAD] for an introduction to the concept of +//! AEADs. +//! +//! [AEAD]: https://eprint.iacr.org/2000/025.pdf +//! [`crypto.cipher.AEAD`]: https://golang.org/pkg/crypto/cipher/#AEAD + +use super::{Algorithm, LessSafeKey, MAX_KEY_LEN}; +use crate::{cpu, error, hkdf}; + +/// An AEAD key without a designated role or nonce sequence. +pub struct UnboundKey { + inner: LessSafeKey, +} + +impl UnboundKey { + /// Constructs a `UnboundKey`. + /// + /// Fails if `key_bytes.len() != algorithm.key_len()`. + #[inline] + pub fn new( + algorithm: &'static Algorithm, + key_bytes: &[u8], + ) -> Result { + Ok(Self { + inner: LessSafeKey::new_(algorithm, key_bytes, cpu::features())?, + }) + } + + /// The key's AEAD algorithm. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.inner.algorithm() + } + + #[inline] + pub(super) fn into_inner(self) -> LessSafeKey { + self.inner + } +} + +impl core::fmt::Debug for UnboundKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + self.inner.fmt_debug("UnboundKey", f) + } +} + +impl From> for UnboundKey { + fn from(okm: hkdf::Okm<&'static Algorithm>) -> Self { + let mut key_bytes = [0; MAX_KEY_LEN]; + let key_bytes = &mut key_bytes[..okm.len().key_len()]; + let algorithm = *okm.len(); + okm.fill(key_bytes).unwrap(); + Self { + inner: LessSafeKey::new_(algorithm, key_bytes, cpu::features()).unwrap(), + } + } +} diff --git a/ring-0.17.14/src/agreement.rs b/ring-0.17.14/src/agreement.rs new file mode 100644 index 0000000000..770423f2be --- /dev/null +++ b/ring-0.17.14/src/agreement.rs @@ -0,0 +1,311 @@ +// Copyright 2015-2017 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Key Agreement: ECDH, including X25519. +//! +//! # Example +//! +//! Note that this example uses X25519, but ECDH using NIST P-256/P-384 is done +//! exactly the same way, just substituting +//! `agreement::ECDH_P256`/`agreement::ECDH_P384` for `agreement::X25519`. +//! +//! ``` +//! use ring::{agreement, rand}; +//! +//! let rng = rand::SystemRandom::new(); +//! +//! let my_private_key = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; +//! +//! // Make `my_public_key` a byte slice containing my public key. In a real +//! // application, this would be sent to the peer in an encoded protocol +//! // message. +//! let my_public_key = my_private_key.compute_public_key()?; +//! +//! let peer_public_key_bytes = { +//! // In a real application, the peer public key would be parsed out of a +//! // protocol message. Here we just generate one. +//! let peer_private_key = +//! agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; +//! peer_private_key.compute_public_key()? +//! }; +//! +//! let peer_public_key = agreement::UnparsedPublicKey::new( +//! &agreement::X25519, +//! peer_public_key_bytes); +//! +//! agreement::agree_ephemeral( +//! my_private_key, +//! &peer_public_key, +//! |_key_material| { +//! // In a real application, we'd apply a KDF to the key material and the +//! // public keys (as recommended in RFC 7748) and then derive session +//! // keys from the result. We omit all that here. +//! }, +//! )?; +//! +//! # Ok::<(), ring::error::Unspecified>(()) +//! ``` + +// The "NSA Guide" steps here are from from section 3.1, "Ephemeral Unified +// Model." + +use crate::{cpu, debug, ec, error, rand}; + +pub use crate::ec::{ + curve25519::x25519::X25519, + suite_b::ecdh::{ECDH_P256, ECDH_P384}, +}; + +/// A key agreement algorithm. +pub struct Algorithm { + pub(crate) curve: &'static ec::Curve, + pub(crate) ecdh: fn( + out: &mut [u8], + private_key: &ec::Seed, + peer_public_key: untrusted::Input, + cpu: cpu::Features, + ) -> Result<(), error::Unspecified>, +} + +derive_debug_via_field!(Algorithm, curve); + +impl Eq for Algorithm {} +impl PartialEq for Algorithm { + fn eq(&self, other: &Self) -> bool { + self.curve.id == other.curve.id + } +} + +/// An ephemeral private key for use (only) with `agree_ephemeral`. The +/// signature of `agree_ephemeral` ensures that an `EphemeralPrivateKey` can be +/// used for at most one key agreement. +pub struct EphemeralPrivateKey { + private_key: ec::Seed, + algorithm: &'static Algorithm, +} + +derive_debug_via_field!( + EphemeralPrivateKey, + stringify!(EphemeralPrivateKey), + algorithm +); + +impl EphemeralPrivateKey { + /// Generate a new ephemeral private key for the given algorithm. 
+ pub fn generate( + alg: &'static Algorithm, + rng: &dyn rand::SecureRandom, + ) -> Result { + let cpu_features = cpu::features(); + + // NSA Guide Step 1. + // + // This only handles the key generation part of step 1. The rest of + // step one is done by `compute_public_key()`. + let private_key = ec::Seed::generate(alg.curve, rng, cpu_features)?; + Ok(Self { + private_key, + algorithm: alg, + }) + } + + /// Computes the public key from the private key. + #[inline(always)] + pub fn compute_public_key(&self) -> Result { + // NSA Guide Step 1. + // + // Obviously, this only handles the part of Step 1 between the private + // key generation and the sending of the public key to the peer. `out` + // is what should be sent to the peer. + self.private_key + .compute_public_key(cpu::features()) + .map(|public_key| PublicKey { + algorithm: self.algorithm, + bytes: public_key, + }) + } + + /// The algorithm for the private key. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } + + /// Do not use. + #[deprecated] + #[cfg(test)] + pub fn bytes(&self) -> &[u8] { + self.bytes_for_test() + } + + #[cfg(test)] + pub(super) fn bytes_for_test(&self) -> &[u8] { + self.private_key.bytes_less_safe() + } +} + +/// A public key for key agreement. +#[derive(Clone)] +pub struct PublicKey { + algorithm: &'static Algorithm, + bytes: ec::PublicKey, +} + +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + self.bytes.as_ref() + } +} + +impl core::fmt::Debug for PublicKey { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("PublicKey") + .field("algorithm", &self.algorithm) + .field("bytes", &debug::HexStr(self.as_ref())) + .finish() + } +} + +impl PublicKey { + /// The algorithm for the public key. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } +} + +/// An unparsed, possibly malformed, public key for key agreement. +#[derive(Clone, Copy)] +pub struct UnparsedPublicKey { + algorithm: &'static Algorithm, + bytes: B, +} + +impl AsRef<[u8]> for UnparsedPublicKey +where + B: AsRef<[u8]>, +{ + fn as_ref(&self) -> &[u8] { + self.bytes.as_ref() + } +} + +impl core::fmt::Debug for UnparsedPublicKey +where + B: AsRef<[u8]>, +{ + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("UnparsedPublicKey") + .field("algorithm", &self.algorithm) + .field("bytes", &debug::HexStr(self.bytes.as_ref())) + .finish() + } +} + +impl UnparsedPublicKey { + /// Constructs a new `UnparsedPublicKey`. + pub fn new(algorithm: &'static Algorithm, bytes: B) -> Self { + Self { algorithm, bytes } + } + + /// The algorithm for the public key. + #[inline] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } + + /// TODO: doc + #[inline] + pub fn bytes(&self) -> &B { + &self.bytes + } +} + +/// Performs a key agreement with an ephemeral private key and the given public +/// key. +/// +/// `my_private_key` is the ephemeral private key to use. Since it is moved, it +/// will not be usable after calling `agree_ephemeral`, thus guaranteeing that +/// the key is used for only one key agreement. +/// +/// `peer_public_key` is the peer's public key. `agree_ephemeral` will return +/// `Err(error_value)` if it does not match `my_private_key's` algorithm/curve. 
+/// `agree_ephemeral` verifies that it is encoded in the standard form for the +/// algorithm and that the key is *valid*; see the algorithm's documentation for +/// details on how keys are to be encoded and what constitutes a valid key for +/// that algorithm. +/// +/// After the key agreement is done, `agree_ephemeral` calls `kdf` with the raw +/// key material from the key agreement operation and then returns what `kdf` +/// returns. +#[inline] +pub fn agree_ephemeral, R>( + my_private_key: EphemeralPrivateKey, + peer_public_key: &UnparsedPublicKey, + kdf: impl FnOnce(&[u8]) -> R, +) -> Result { + let peer_public_key = UnparsedPublicKey { + algorithm: peer_public_key.algorithm, + bytes: peer_public_key.bytes.as_ref(), + }; + agree_ephemeral_(my_private_key, peer_public_key, kdf, cpu::features()) +} + +fn agree_ephemeral_( + my_private_key: EphemeralPrivateKey, + peer_public_key: UnparsedPublicKey<&[u8]>, + kdf: impl FnOnce(&[u8]) -> R, + cpu: cpu::Features, +) -> Result { + // NSA Guide Prerequisite 1. + // + // The domain parameters are hard-coded. This check verifies that the + // peer's public key's domain parameters match the domain parameters of + // this private key. + if peer_public_key.algorithm != my_private_key.algorithm { + return Err(error::Unspecified); + } + + let alg = &my_private_key.algorithm; + + // NSA Guide Prerequisite 2, regarding which KDFs are allowed, is delegated + // to the caller. + + // NSA Guide Prerequisite 3, "Prior to or during the key-agreement process, + // each party shall obtain the identifier associated with the other party + // during the key-agreement scheme," is delegated to the caller. + + // NSA Guide Step 1 is handled by `EphemeralPrivateKey::generate()` and + // `EphemeralPrivateKey::compute_public_key()`. + + let mut shared_key = [0u8; ec::ELEM_MAX_BYTES]; + let shared_key = &mut shared_key[..alg.curve.elem_scalar_seed_len]; + + // NSA Guide Steps 2, 3, and 4. + // + // We have a pretty liberal interpretation of the NIST's spec's "Destroy" + // that doesn't meet the NSA requirement to "zeroize." + (alg.ecdh)( + shared_key, + &my_private_key.private_key, + untrusted::Input::from(peer_public_key.bytes), + cpu, + )?; + + // NSA Guide Steps 5 and 6. + // + // Again, we have a pretty liberal interpretation of the NIST's spec's + // "Destroy" that doesn't meet the NSA requirement to "zeroize." + Ok(kdf(shared_key)) +} diff --git a/ring-0.17.14/src/arithmetic.rs b/ring-0.17.14/src/arithmetic.rs new file mode 100644 index 0000000000..94df7e461b --- /dev/null +++ b/ring-0.17.14/src/arithmetic.rs @@ -0,0 +1,47 @@ +// Copyright 2017-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
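
The module-level example in agreement.rs above shows one party's side of the exchange. As a complementary sketch, again against the crate's public API and not part of the vendored sources (function name is illustrative), the code below runs both sides of an X25519 agreement and checks that the two `agree_ephemeral` calls hand the same raw key material to their closures:

```rust
use ring::{agreement, error, rand};

fn x25519_round_trip() -> Result<(), error::Unspecified> {
    let rng = rand::SystemRandom::new();

    // Each party generates an ephemeral private key and publishes its public key.
    let alice_private = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?;
    let bob_private = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?;
    let alice_public = alice_private.compute_public_key()?;
    let bob_public = bob_private.compute_public_key()?;

    // Each side agrees using its own private key and the peer's public key.
    let alice_secret = agreement::agree_ephemeral(
        alice_private,
        &agreement::UnparsedPublicKey::new(&agreement::X25519, bob_public.as_ref()),
        |key_material| key_material.to_vec(),
    )?;
    let bob_secret = agreement::agree_ephemeral(
        bob_private,
        &agreement::UnparsedPublicKey::new(&agreement::X25519, alice_public.as_ref()),
        |key_material| key_material.to_vec(),
    )?;
    assert_eq!(alice_secret, bob_secret);

    // In real use, feed the key material and both public keys into a KDF
    // (e.g. HKDF) rather than comparing or storing it directly.
    Ok(())
}
```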
+ +pub(crate) use self::{constant::limbs_from_hex, limb_slice_error::LimbSliceError}; +use crate::{error::LenMismatchError, limb::LIMB_BITS}; + +#[macro_use] +mod ffi; + +mod constant; + +#[cfg(feature = "alloc")] +pub mod bigint; + +pub(crate) mod inout; +mod limbs; +mod limbs512; +pub mod montgomery; + +mod n0; + +// The minimum number of limbs allowed for any `&[Limb]` operation. +// +// TODO: Use `256 / LIMB_BITS` so that the limit is independent of limb size. +pub const MIN_LIMBS: usize = 4; + +// The maximum number of limbs allowed for any `&[Limb]` operation. +pub const MAX_LIMBS: usize = 8192 / LIMB_BITS; + +cold_exhaustive_error! { + enum limb_slice_error::LimbSliceError { + len_mismatch => LenMismatch(LenMismatchError), + too_short => TooShort(usize), + too_long => TooLong(usize), + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint.rs b/ring-0.17.14/src/arithmetic/bigint.rs new file mode 100644 index 0000000000..7251a64ad8 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint.rs @@ -0,0 +1,1068 @@ +// Copyright 2015-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Multi-precision integers. +//! +//! # Modular Arithmetic. +//! +//! Modular arithmetic is done in finite commutative rings ℤ/mℤ for some +//! modulus *m*. We work in finite commutative rings instead of finite fields +//! because the RSA public modulus *n* is not prime, which means ℤ/nℤ contains +//! nonzero elements that have no multiplicative inverse, so ℤ/nℤ is not a +//! finite field. +//! +//! In some calculations we need to deal with multiple rings at once. For +//! example, RSA private key operations operate in the rings ℤ/nℤ, ℤ/pℤ, and +//! ℤ/qℤ. Types and functions dealing with such rings are all parameterized +//! over a type `M` to ensure that we don't wrongly mix up the math, e.g. by +//! multiplying an element of ℤ/pℤ by an element of ℤ/qℤ modulo q. This follows +//! the "unit" pattern described in [Static checking of units in Servo]. +//! +//! `Elem` also uses the static unit checking pattern to statically track the +//! Montgomery factors that need to be canceled out in each value using it's +//! `E` parameter. +//! +//! [Static checking of units in Servo]: +//! 
https://blog.mozilla.org/research/2014/06/23/static-checking-of-units-in-servo/ + +use self::boxed_limbs::BoxedLimbs; +pub(crate) use self::{ + modulus::{Modulus, OwnedModulus}, + modulusvalue::OwnedModulusValue, + private_exponent::PrivateExponent, +}; +use super::{inout::AliasingSlices3, limbs512, montgomery::*, LimbSliceError, MAX_LIMBS}; +use crate::{ + bits::BitLength, + c, + error::{self, LenMismatchError}, + limb::{self, Limb, LIMB_BITS}, + polyfill::slice::{self, AsChunks}, +}; +use core::{ + marker::PhantomData, + num::{NonZeroU64, NonZeroUsize}, +}; + +mod boxed_limbs; +mod modulus; +mod modulusvalue; +mod private_exponent; + +pub trait PublicModulus {} + +// When we need to create a new `Elem`, first we create a `Storage` and then +// move its `limbs` into the new element. When we want to recylce an `Elem`'s +// memory allocation, we convert it back into a `Storage`. +pub struct Storage { + limbs: BoxedLimbs, +} + +impl From> for Storage { + fn from(elem: Elem) -> Self { + Self { limbs: elem.limbs } + } +} + +/// Elements of ℤ/mℤ for some modulus *m*. +// +// Defaulting `E` to `Unencoded` is a convenience for callers from outside this +// submodule. However, for maximum clarity, we always explicitly use +// `Unencoded` within the `bigint` submodule. +pub struct Elem { + limbs: BoxedLimbs, + + /// The number of Montgomery factors that need to be canceled out from + /// `value` to get the actual value. + encoding: PhantomData, +} + +impl Elem { + pub fn clone_into(&self, mut out: Storage) -> Self { + out.limbs.copy_from_slice(&self.limbs); + Self { + limbs: out.limbs, + encoding: self.encoding, + } + } +} + +impl Elem { + #[inline] + pub fn is_zero(&self) -> bool { + limb::limbs_are_zero(&self.limbs).leak() + } +} + +/// Does a Montgomery reduction on `limbs` assuming they are Montgomery-encoded ('R') and assuming +/// they are the same size as `m`, but perhaps not reduced mod `m`. The result will be +/// fully reduced mod `m`. +/// +/// WARNING: Takes a `Storage` as an in/out value. +fn from_montgomery_amm(mut in_out: Storage, m: &Modulus) -> Elem { + let mut one = [0; MAX_LIMBS]; + one[0] = 1; + let one = &one[..m.limbs().len()]; + limbs_mul_mont( + (&mut in_out.limbs[..], one), + m.limbs(), + m.n0(), + m.cpu_features(), + ) + .unwrap_or_else(unwrap_impossible_limb_slice_error); + Elem { + limbs: in_out.limbs, + encoding: PhantomData, + } +} + +#[cfg(any(test, not(target_arch = "x86_64")))] +impl Elem { + #[inline] + pub fn into_unencoded(self, m: &Modulus) -> Elem { + from_montgomery_amm(Storage::from(self), m) + } +} + +impl Elem { + pub fn from_be_bytes_padded( + input: untrusted::Input, + m: &Modulus, + ) -> Result { + Ok(Self { + limbs: BoxedLimbs::from_be_bytes_padded_less_than(input, m)?, + encoding: PhantomData, + }) + } + + #[inline] + pub fn fill_be_bytes(&self, out: &mut [u8]) { + // See Falko Strenzke, "Manger's Attack revisited", ICICS 2010. 
+ limb::big_endian_from_limbs(&self.limbs, out) + } +} + +pub fn elem_mul_into( + mut out: Storage, + a: &Elem, + b: &Elem, + m: &Modulus, +) -> Elem::Output> +where + (AF, BF): ProductEncoding, +{ + limbs_mul_mont( + (out.limbs.as_mut(), b.limbs.as_ref(), a.limbs.as_ref()), + m.limbs(), + m.n0(), + m.cpu_features(), + ) + .unwrap_or_else(unwrap_impossible_limb_slice_error); + Elem { + limbs: out.limbs, + encoding: PhantomData, + } +} + +pub fn elem_mul( + a: &Elem, + mut b: Elem, + m: &Modulus, +) -> Elem::Output> +where + (AF, BF): ProductEncoding, +{ + limbs_mul_mont( + (&mut b.limbs[..], &a.limbs[..]), + m.limbs(), + m.n0(), + m.cpu_features(), + ) + .unwrap_or_else(unwrap_impossible_limb_slice_error); + Elem { + limbs: b.limbs, + encoding: PhantomData, + } +} + +// r *= 2. +fn elem_double(r: &mut Elem, m: &Modulus) { + limb::limbs_double_mod(&mut r.limbs, m.limbs()) + .unwrap_or_else(unwrap_impossible_len_mismatch_error) +} + +// TODO: This is currently unused, but we intend to eventually use this to +// reduce elements (x mod q) mod p in the RSA CRT. If/when we do so, we +// should update the testing so it is reflective of that usage, instead of +// the old usage. +pub fn elem_reduced_once( + mut r: Storage, + a: &Elem, + m: &Modulus, + other_modulus_len_bits: BitLength, +) -> Elem { + assert_eq!(m.len_bits(), other_modulus_len_bits); + r.limbs.copy_from_slice(&a.limbs); + limb::limbs_reduce_once(&mut r.limbs, m.limbs()) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + Elem { + limbs: r.limbs, + encoding: PhantomData, + } +} + +#[inline] +pub fn elem_reduced( + mut r: Storage, + a: &Elem, + m: &Modulus, + other_prime_len_bits: BitLength, +) -> Elem { + // This is stricter than required mathematically but this is what we + // guarantee and this is easier to check. The real requirement is that + // that `a < m*R` where `R` is the Montgomery `R` for `m`. + assert_eq!(other_prime_len_bits, m.len_bits()); + + // `limbs_from_mont_in_place` requires this. + assert_eq!(a.limbs.len(), m.limbs().len() * 2); + + let mut tmp = [0; MAX_LIMBS]; + let tmp = &mut tmp[..a.limbs.len()]; + tmp.copy_from_slice(&a.limbs); + + limbs_from_mont_in_place(&mut r.limbs, tmp, m.limbs(), m.n0()); + Elem { + limbs: r.limbs, + encoding: PhantomData, + } +} + +#[inline] +fn elem_squared( + mut a: Elem, + m: &Modulus, +) -> Elem::Output> +where + (E, E): ProductEncoding, +{ + limbs_square_mont(&mut a.limbs, m.limbs(), m.n0(), m.cpu_features()) + .unwrap_or_else(unwrap_impossible_limb_slice_error); + Elem { + limbs: a.limbs, + encoding: PhantomData, + } +} + +pub fn elem_widen( + mut r: Storage, + a: Elem, + m: &Modulus, + smaller_modulus_bits: BitLength, +) -> Result, error::Unspecified> { + if smaller_modulus_bits >= m.len_bits() { + return Err(error::Unspecified); + } + let (to_copy, to_zero) = r.limbs.split_at_mut(a.limbs.len()); + to_copy.copy_from_slice(&a.limbs); + to_zero.fill(0); + Ok(Elem { + limbs: r.limbs, + encoding: PhantomData, + }) +} + +// TODO: Document why this works for all Montgomery factors. +pub fn elem_add(mut a: Elem, b: Elem, m: &Modulus) -> Elem { + limb::limbs_add_assign_mod(&mut a.limbs, &b.limbs, m.limbs()) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + a +} + +// TODO: Document why this works for all Montgomery factors. +pub fn elem_sub(mut a: Elem, b: &Elem, m: &Modulus) -> Elem { + prefixed_extern! { + // `r` and `a` may alias. 
+ fn LIMBS_sub_mod( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + m: *const Limb, + num_limbs: c::NonZero_size_t, + ); + } + let num_limbs = NonZeroUsize::new(m.limbs().len()).unwrap(); + (a.limbs.as_mut(), b.limbs.as_ref()) + .with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, b| { + let m = m.limbs().as_ptr(); // Also non-dangling because num_limbs is non-zero. + unsafe { LIMBS_sub_mod(r, a, b, m, num_limbs) } + }) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + a +} + +// The value 1, Montgomery-encoded some number of times. +pub struct One(Elem); + +impl One { + // Returns RR = = R**2 (mod n) where R = 2**r is the smallest power of + // 2**LIMB_BITS such that R > m. + // + // Even though the assembly on some 32-bit platforms works with 64-bit + // values, using `LIMB_BITS` here, rather than `N0::LIMBS_USED * LIMB_BITS`, + // is correct because R**2 will still be a multiple of the latter as + // `N0::LIMBS_USED` is either one or two. + pub(crate) fn newRR(mut out: Storage, m: &Modulus) -> Self { + // The number of limbs in the numbers involved. + let w = m.limbs().len(); + + // The length of the numbers involved, in bits. R = 2**r. + let r = w * LIMB_BITS; + + m.oneR(&mut out.limbs); + let mut acc: Elem = Elem { + limbs: out.limbs, + encoding: PhantomData, + }; + + // 2**t * R can be calculated by t doublings starting with R. + // + // Choose a t that divides r and where t doublings are cheaper than 1 squaring. + // + // We could choose other values of t than w. But if t < d then the exponentiation that + // follows would require multiplications. Normally d is 1 (i.e. the modulus length is a + // power of two: RSA 1024, 2048, 4097, 8192) or 3 (RSA 1536, 3072). + // + // XXX(perf): Currently t = w / 2 is slightly faster. TODO(perf): Optimize `elem_double` + // and re-run benchmarks to rebalance this. + let t = w; + let z = w.trailing_zeros(); + let d = w >> z; + debug_assert_eq!(w, d * (1 << z)); + debug_assert!(d <= t); + debug_assert!(t < r); + for _ in 0..t { + elem_double(&mut acc, m); + } + + // Because t | r: + // + // MontExp(2**t * R, r / t) + // = (2**t)**(r / t) * R (mod m) by definition of MontExp. + // = (2**t)**(1/t * r) * R (mod m) + // = (2**(t * 1/t))**r * R (mod m) + // = (2**1)**r * R (mod m) + // = 2**r * R (mod m) + // = R * R (mod m) + // = RR + // + // Like BoringSSL, use t = w (`m.limbs.len()`) which ensures that the exponent is a power + // of two. Consequently, there will be no multiplications in the Montgomery exponentiation; + // there will only be lg(r / t) squarings. + // + // lg(r / t) + // = lg((w * 2**b) / t) + // = lg((t * 2**b) / t) + // = lg(2**b) + // = b + // TODO(MSRV:1.67): const B: u32 = LIMB_BITS.ilog2(); + const B: u32 = if cfg!(target_pointer_width = "64") { + 6 + } else if cfg!(target_pointer_width = "32") { + 5 + } else { + panic!("unsupported target_pointer_width") + }; + #[allow(clippy::assertions_on_constants)] + const _LIMB_BITS_IS_2_POW_B: () = assert!(LIMB_BITS == 1 << B); + debug_assert_eq!(r, t * (1 << B)); + for _ in 0..B { + acc = elem_squared(acc, m); + } + + Self(Elem { + limbs: acc.limbs, + encoding: PhantomData, // PhantomData + }) + } +} + +impl One { + pub(crate) fn newRRR(One(oneRR): One, m: &Modulus) -> Self { + Self(elem_squared(oneRR, m)) + } +} + +impl AsRef> for One { + fn as_ref(&self) -> &Elem { + &self.0 + } +} + +impl One { + pub fn clone_into(&self, out: Storage) -> Self { + Self(self.0.clone_into(out)) + } +} + +/// Calculates base**exponent (mod m). 
+/// +/// The run time is a function of the number of limbs in `m` and the bit +/// length and Hamming Weight of `exponent`. The bounds on `m` are pretty +/// obvious but the bounds on `exponent` are less obvious. Callers should +/// document the bounds they place on the maximum value and maximum Hamming +/// weight of `exponent`. +// TODO: The test coverage needs to be expanded, e.g. test with the largest +// accepted exponent and with the most common values of 65537 and 3. +pub(crate) fn elem_exp_vartime( + out: Storage, + base: Elem, + exponent: NonZeroU64, + m: &Modulus, +) -> Elem { + // Use what [Knuth] calls the "S-and-X binary method", i.e. variable-time + // square-and-multiply that scans the exponent from the most significant + // bit to the least significant bit (left-to-right). Left-to-right requires + // less storage compared to right-to-left scanning, at the cost of needing + // to compute `exponent.leading_zeros()`, which we assume to be cheap. + // + // As explained in [Knuth], exponentiation by squaring is the most + // efficient algorithm when the Hamming weight is 2 or less. It isn't the + // most efficient for all other, uncommon, exponent values but any + // suboptimality is bounded at least by the small bit length of `exponent` + // as enforced by its type. + // + // This implementation is slightly simplified by taking advantage of the + // fact that we require the exponent to be a positive integer. + // + // [Knuth]: The Art of Computer Programming, Volume 2: Seminumerical + // Algorithms (3rd Edition), Section 4.6.3. + let exponent = exponent.get(); + let mut acc = base.clone_into(out); + let mut bit = 1 << (64 - 1 - exponent.leading_zeros()); + debug_assert!((exponent & bit) != 0); + while bit > 1 { + bit >>= 1; + acc = elem_squared(acc, m); + if (exponent & bit) != 0 { + acc = elem_mul(&base, acc, m); + } + } + acc +} + +pub fn elem_exp_consttime( + out: Storage

, + base: &Elem, + oneRRR: &One, + exponent: &PrivateExponent, + p: &Modulus

, + q: PrivateCrtPrime, + qInv: bigint::Elem, + public: PublicKey, +} + +derive_debug_via_field!(KeyPair, stringify!(RsaKeyPair), public); + +impl KeyPair { + /// Parses an unencrypted PKCS#8-encoded RSA private key. + /// + /// This will generate a 2048-bit RSA private key of the correct form using + /// OpenSSL's command line tool: + /// + /// ```sh + /// openssl genpkey -algorithm RSA \ + /// -pkeyopt rsa_keygen_bits:2048 \ + /// -pkeyopt rsa_keygen_pubexp:65537 | \ + /// openssl pkcs8 -topk8 -nocrypt -outform der > rsa-2048-private-key.pk8 + /// ``` + /// + /// This will generate a 3072-bit RSA private key of the correct form: + /// + /// ```sh + /// openssl genpkey -algorithm RSA \ + /// -pkeyopt rsa_keygen_bits:3072 \ + /// -pkeyopt rsa_keygen_pubexp:65537 | \ + /// openssl pkcs8 -topk8 -nocrypt -outform der > rsa-3072-private-key.pk8 + /// ``` + /// + /// Often, keys generated for use in OpenSSL-based software are stored in + /// the Base64 “PEM” format without the PKCS#8 wrapper. Such keys can be + /// converted to binary PKCS#8 form using the OpenSSL command line tool like + /// this: + /// + /// ```sh + /// openssl pkcs8 -topk8 -nocrypt -outform der \ + /// -in rsa-2048-private-key.pem > rsa-2048-private-key.pk8 + /// ``` + /// + /// Base64 (“PEM”) PKCS#8-encoded keys can be converted to the binary PKCS#8 + /// form like this: + /// + /// ```sh + /// openssl pkcs8 -nocrypt -outform der \ + /// -in rsa-2048-private-key.pem > rsa-2048-private-key.pk8 + /// ``` + /// + /// See [`Self::from_components`] for more details on how the input is + /// validated. + /// + /// See [RFC 5958] and [RFC 3447 Appendix A.1.2] for more details of the + /// encoding of the key. + /// + /// [NIST SP-800-56B rev. 1]: + /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Br1.pdf + /// + /// [RFC 3447 Appendix A.1.2]: + /// https://tools.ietf.org/html/rfc3447#appendix-A.1.2 + /// + /// [RFC 5958]: + /// https://tools.ietf.org/html/rfc5958 + pub fn from_pkcs8(pkcs8: &[u8]) -> Result { + const RSA_ENCRYPTION: &[u8] = include_bytes!("../data/alg-rsa-encryption.der"); + let (der, _) = pkcs8::unwrap_key_( + untrusted::Input::from(RSA_ENCRYPTION), + pkcs8::Version::V1Only, + untrusted::Input::from(pkcs8), + )?; + Self::from_der(der.as_slice_less_safe()) + } + + /// Parses an RSA private key that is not inside a PKCS#8 wrapper. + /// + /// The private key must be encoded as a binary DER-encoded ASN.1 + /// `RSAPrivateKey` as described in [RFC 3447 Appendix A.1.2]). In all other + /// respects, this is just like `from_pkcs8()`. See the documentation for + /// `from_pkcs8()` for more details. + /// + /// It is recommended to use `from_pkcs8()` (with a PKCS#8-encoded key) + /// instead. + /// + /// See [`Self::from_components()`] for more details on how the input is + /// validated. + /// + /// [RFC 3447 Appendix A.1.2]: + /// https://tools.ietf.org/html/rfc3447#appendix-A.1.2 + /// + /// [NIST SP-800-56B rev. 
1]: + /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Br1.pdf + pub fn from_der(input: &[u8]) -> Result { + untrusted::Input::from(input).read_all(KeyRejected::invalid_encoding(), |input| { + der::nested( + input, + der::Tag::Sequence, + KeyRejected::invalid_encoding(), + Self::from_der_reader, + ) + }) + } + + fn from_der_reader(input: &mut untrusted::Reader) -> Result { + let version = der::small_nonnegative_integer(input) + .map_err(|error::Unspecified| KeyRejected::invalid_encoding())?; + if version != 0 { + return Err(KeyRejected::version_not_supported()); + } + + fn nonnegative_integer<'a>( + input: &mut untrusted::Reader<'a>, + ) -> Result<&'a [u8], KeyRejected> { + der::nonnegative_integer(input) + .map(|input| input.as_slice_less_safe()) + .map_err(|error::Unspecified| KeyRejected::invalid_encoding()) + } + + let n = nonnegative_integer(input)?; + let e = nonnegative_integer(input)?; + let d = nonnegative_integer(input)?; + let p = nonnegative_integer(input)?; + let q = nonnegative_integer(input)?; + let dP = nonnegative_integer(input)?; + let dQ = nonnegative_integer(input)?; + let qInv = nonnegative_integer(input)?; + + let components = KeyPairComponents { + public_key: PublicKeyComponents { n, e }, + d, + p, + q, + dP, + dQ, + qInv, + }; + + Self::from_components(&components) + } + + /// Constructs an RSA private key from its big-endian-encoded components. + /// + /// Only two-prime (not multi-prime) keys are supported. The public modulus + /// (n) must be at least 2047 bits. The public modulus must be no larger + /// than 4096 bits. It is recommended that the public modulus be exactly + /// 2048 or 3072 bits. The public exponent must be at least 65537 and must + /// be no more than 33 bits long. + /// + /// The private key is validated according to [NIST SP-800-56B rev. 1] + /// section 6.4.1.4.3, crt_pkv (Intended Exponent-Creation Method Unknown), + /// with the following exceptions: + /// + /// * Section 6.4.1.2.1, Step 1: Neither a target security level nor an + /// expected modulus length is provided as a parameter, so checks + /// regarding these expectations are not done. + /// * Section 6.4.1.2.1, Step 3: Since neither the public key nor the + /// expected modulus length is provided as a parameter, the consistency + /// check between these values and the private key's value of n isn't + /// done. + /// * Section 6.4.1.2.1, Step 5: No primality tests are done, both for + /// performance reasons and to avoid any side channels that such tests + /// would provide. + /// * Section 6.4.1.2.1, Step 6, and 6.4.1.4.3, Step 7: + /// * *ring* has a slightly looser lower bound for the values of `p` + /// and `q` than what the NIST document specifies. This looser lower + /// bound matches what most other crypto libraries do. The check might + /// be tightened to meet NIST's requirements in the future. Similarly, + /// the check that `p` and `q` are not too close together is skipped + /// currently, but may be added in the future. + /// * The validity of the mathematical relationship of `dP`, `dQ`, `e` + /// and `n` is verified only during signing. Some size checks of `d`, + /// `dP` and `dQ` are performed at construction, but some NIST checks + /// are skipped because they would be expensive and/or they would leak + /// information through side channels. If a preemptive check of the + /// consistency of `dP`, `dQ`, `e` and `n` with each other is + /// necessary, that can be done by signing any message with the key + /// pair. 
+ /// + /// * `d` is not fully validated, neither at construction nor during + /// signing. This is OK as far as *ring*'s usage of the key is + /// concerned because *ring* never uses the value of `d` (*ring* always + /// uses `p`, `q`, `dP` and `dQ` via the Chinese Remainder Theorem, + /// instead). However, *ring*'s checks would not be sufficient for + /// validating a key pair for use by some other system; that other + /// system must check the value of `d` itself if `d` is to be used. + pub fn from_components( + components: &KeyPairComponents, + ) -> Result + where + Public: AsRef<[u8]>, + Private: AsRef<[u8]>, + { + let components = KeyPairComponents { + public_key: PublicKeyComponents { + n: components.public_key.n.as_ref(), + e: components.public_key.e.as_ref(), + }, + d: components.d.as_ref(), + p: components.p.as_ref(), + q: components.q.as_ref(), + dP: components.dP.as_ref(), + dQ: components.dQ.as_ref(), + qInv: components.qInv.as_ref(), + }; + Self::from_components_(&components, cpu::features()) + } + + fn from_components_( + &KeyPairComponents { + public_key, + d, + p, + q, + dP, + dQ, + qInv, + }: &KeyPairComponents<&[u8]>, + cpu_features: cpu::Features, + ) -> Result { + let d = untrusted::Input::from(d); + let p = untrusted::Input::from(p); + let q = untrusted::Input::from(q); + let dP = untrusted::Input::from(dP); + let dQ = untrusted::Input::from(dQ); + let qInv = untrusted::Input::from(qInv); + + // XXX: Some steps are done out of order, but the NIST steps are worded + // in such a way that it is clear that NIST intends for them to be done + // in order. TODO: Does this matter at all? + + // 6.4.1.4.3/6.4.1.2.1 - Step 1. + + // Step 1.a is omitted, as explained above. + + // Step 1.b is omitted per above. Instead, we check that the public + // modulus is 2048 to `PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS` bits. + // XXX: The maximum limit of 4096 bits is primarily due to lack of + // testing of larger key sizes; see, in particular, + // https://www.mail-archive.com/openssl-dev@openssl.org/msg44586.html + // and + // https://www.mail-archive.com/openssl-dev@openssl.org/msg44759.html. + // Also, this limit might help with memory management decisions later. + + // Step 1.c. We validate e >= 65537. + let n = untrusted::Input::from(public_key.n); + let e = untrusted::Input::from(public_key.e); + let public_key = PublicKey::from_modulus_and_exponent( + n, + e, + BitLength::from_bits(2048), + super::PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS, + PublicExponent::_65537, + cpu_features, + )?; + + let n_one = public_key.inner().n().oneRR(); + let n = &public_key.inner().n().value(cpu_features); + + // 6.4.1.4.3 says to skip 6.4.1.2.1 Step 2. + + // 6.4.1.4.3 Step 3. + + // Step 3.a is done below, out of order. + // Step 3.b is unneeded since `n_bits` is derived here from `n`. + + // 6.4.1.4.3 says to skip 6.4.1.2.1 Step 4. (We don't need to recover + // the prime factors since they are already given.) + + // 6.4.1.4.3 - Step 5. + + // Steps 5.a and 5.b are omitted, as explained above. + + let n_bits = public_key.inner().n().len_bits(); + + let p = PrivatePrime::new(p, n_bits, cpu_features)?; + let q = PrivatePrime::new(q, n_bits, cpu_features)?; + + // TODO: Step 5.i + // + // 3.b is unneeded since `n_bits` is derived here from `n`. + + // 6.4.1.4.3 - Step 3.a (out of order). + // + // Verify that p * q == n. We restrict ourselves to modular + // multiplication. We rely on the fact that we've verified + // 0 < q < p < n. 
We check that q and p are close to sqrt(n) and then + // assume that these preconditions are enough to let us assume that + // checking p * q == 0 (mod n) is equivalent to checking p * q == n. + let q_mod_n = q + .modulus + .to_elem(n) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + let p_mod_n = p + .modulus + .to_elem(n) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + let p_mod_n = bigint::elem_mul(n_one, p_mod_n, n); + let pq_mod_n = bigint::elem_mul(&q_mod_n, p_mod_n, n); + if !pq_mod_n.is_zero() { + return Err(KeyRejected::inconsistent_components()); + } + + // 6.4.1.4.3/6.4.1.2.1 - Step 6. + + // Step 6.a, partial. + // + // First, validate `2**half_n_bits < d`. Since 2**half_n_bits has a bit + // length of half_n_bits + 1, this check gives us 2**half_n_bits <= d, + // and knowing d is odd makes the inequality strict. + let d = bigint::OwnedModulusValue::::from_be_bytes(d) + .map_err(|_| KeyRejected::invalid_component())?; + if !(n_bits.half_rounded_up() < d.len_bits()) { + return Err(KeyRejected::inconsistent_components()); + } + // XXX: This check should be `d < LCM(p - 1, q - 1)`, but we don't have + // a good way of calculating LCM, so it is omitted, as explained above. + d.verify_less_than(n) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + + // Step 6.b is omitted as explained above. + + let pm = &p.modulus.modulus(cpu_features); + + // 6.4.1.4.3 - Step 7. + + // Step 7.c. + let qInv = bigint::Elem::from_be_bytes_padded(qInv, pm) + .map_err(|error::Unspecified| KeyRejected::invalid_component())?; + + // Steps 7.d and 7.e are omitted per the documentation above, and + // because we don't (in the long term) have a good way to do modulo + // with an even modulus. + + // Step 7.f. + let qInv = bigint::elem_mul(p.oneRR.as_ref(), qInv, pm); + let q_mod_p = bigint::elem_reduced(pm.alloc_zero(), &q_mod_n, pm, q.modulus.len_bits()); + let q_mod_p = bigint::elem_mul(p.oneRR.as_ref(), q_mod_p, pm); + bigint::verify_inverses_consttime(&qInv, q_mod_p, pm) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + + // This should never fail since `n` and `e` were validated above. + + let p = PrivateCrtPrime::new(p, dP, cpu_features)?; + let q = PrivateCrtPrime::new(q, dQ, cpu_features)?; + + Ok(Self { + p, + q, + qInv, + public: public_key, + }) + } + + /// Returns a reference to the public key. + pub fn public(&self) -> &PublicKey { + &self.public + } + + /// Returns the length in bytes of the key pair's public modulus. + /// + /// A signature has the same length as the public modulus. + #[deprecated = "Use `public().modulus_len()`"] + #[inline] + pub fn public_modulus_len(&self) -> usize { + self.public().modulus_len() + } +} + +impl signature::KeyPair for KeyPair { + type PublicKey = PublicKey; + + fn public_key(&self) -> &Self::PublicKey { + self.public() + } +} + +struct PrivatePrime { + modulus: bigint::OwnedModulus, + oneRR: bigint::One, +} + +impl PrivatePrime { + fn new( + p: untrusted::Input, + n_bits: BitLength, + cpu_features: cpu::Features, + ) -> Result { + let p = bigint::OwnedModulusValue::from_be_bytes(p)?; + + // 5.c / 5.g: + // + // TODO: First, stop if `p < (√2) * 2**((nBits/2) - 1)`. + // TODO: First, stop if `q < (√2) * 2**((nBits/2) - 1)`. + // + // Second, stop if `p > 2**(nBits/2) - 1`. + // Second, stop if `q > 2**(nBits/2) - 1`. 
+ if p.len_bits() != n_bits.half_rounded_up() { + return Err(KeyRejected::inconsistent_components()); + } + + if p.len_bits().as_bits() % 512 != 0 { + return Err(KeyRejected::private_modulus_len_not_multiple_of_512_bits()); + } + + // TODO: Step 5.d: Verify GCD(p - 1, e) == 1. + // TODO: Step 5.h: Verify GCD(q - 1, e) == 1. + + // Steps 5.e and 5.f are omitted as explained above. + let p = bigint::OwnedModulus::from(p); + let pm = p.modulus(cpu_features); + let oneRR = bigint::One::newRR(pm.alloc_zero(), &pm); + + Ok(Self { modulus: p, oneRR }) + } +} + +struct PrivateCrtPrime { + modulus: bigint::OwnedModulus, + oneRRR: bigint::One, + exponent: bigint::PrivateExponent, +} + +impl PrivateCrtPrime { + /// Constructs a `PrivateCrtPrime` from the private prime `p` and `dP` where + /// dP == d % (p - 1). + fn new( + p: PrivatePrime, + dP: untrusted::Input, + cpu_features: cpu::Features, + ) -> Result { + let m = &p.modulus.modulus(cpu_features); + // [NIST SP-800-56B rev. 1] 6.4.1.4.3 - Steps 7.a & 7.b. + let dP = bigint::PrivateExponent::from_be_bytes_padded(dP, m) + .map_err(|error::Unspecified| KeyRejected::inconsistent_components())?; + + // XXX: Steps 7.d and 7.e are omitted. We don't check that + // `dP == d % (p - 1)` because we don't (in the long term) have a good + // way to do modulo with an even modulus. Instead we just check that + // `1 <= dP < p - 1`. We'll check it, to some unknown extent, when we + // do the private key operation, since we verify that the result of the + // private key operation using the CRT parameters is consistent with `n` + // and `e`. TODO: Either prove that what we do is sufficient, or make + // it so. + + let oneRRR = bigint::One::newRRR(p.oneRR, m); + + Ok(Self { + modulus: p.modulus, + oneRRR, + exponent: dP, + }) + } +} + +fn elem_exp_consttime( + c: &bigint::Elem, + p: &PrivateCrtPrime, + other_prime_len_bits: BitLength, + cpu_features: cpu::Features, +) -> Result, error::Unspecified> { + let m = &p.modulus.modulus(cpu_features); + bigint::elem_exp_consttime( + m.alloc_zero(), + c, + &p.oneRRR, + &p.exponent, + m, + other_prime_len_bits, + ) + .map_err(error::erase::) +} + +// Type-level representations of the different moduli used in RSA signing, in +// addition to `super::N`. See `super::bigint`'s modulue-level documentation. + +enum P {} + +enum Q {} + +enum D {} + +impl KeyPair { + /// Computes the signature of `msg` and writes it into `signature`. + /// + /// `msg` is digested using the digest algorithm from `padding_alg` and the + /// digest is then padded using the padding algorithm from `padding_alg`. + /// + /// The signature it written into `signature`; `signature`'s length must be + /// exactly the length returned by `self::public().modulus_len()` or else + /// an error will be returned. On failure, `signature` may contain + /// intermediate results, but won't contain anything that would endanger the + /// private key. + /// + /// `rng` may be used to randomize the padding (e.g. for PSS). + /// + /// Many other crypto libraries have signing functions that takes a + /// precomputed digest as input, instead of the message to digest. This + /// function does *not* take a precomputed digest; instead, `sign` + /// calculates the digest itself. 
+ pub fn sign( + &self, + padding_alg: &'static dyn RsaEncoding, + rng: &dyn rand::SecureRandom, + msg: &[u8], + signature: &mut [u8], + ) -> Result<(), error::Unspecified> { + let cpu_features = cpu::features(); + + if signature.len() != self.public().modulus_len() { + return Err(error::Unspecified); + } + + let m_hash = digest::digest(padding_alg.digest_alg(), msg); + + // Use the output buffer as the scratch space for the signature to + // reduce the required stack space. + padding::encode( + padding_alg, + m_hash, + signature, + self.public().inner().n().len_bits(), + rng, + )?; + + // RFC 8017 Section 5.1.2: RSADP, using the Chinese Remainder Theorem + // with Garner's algorithm. + + // Steps 1 and 2. + let m = self.private_exponentiate(signature, cpu_features)?; + + // Step 3. + m.fill_be_bytes(signature); + + Ok(()) + } + + /// Returns base**d (mod n). + /// + /// This does not return or write any intermediate results into any buffers + /// that are provided by the caller so that no intermediate state will be + /// leaked that would endanger the private key. + /// + /// Panics if `in_out` is not `self.public().modulus_len()`. + fn private_exponentiate( + &self, + base: &[u8], + cpu_features: cpu::Features, + ) -> Result, error::Unspecified> { + assert_eq!(base.len(), self.public().modulus_len()); + + // RFC 8017 Section 5.1.2: RSADP, using the Chinese Remainder Theorem + // with Garner's algorithm. + + let n = &self.public.inner().n().value(cpu_features); + let n_one = self.public.inner().n().oneRR(); + + // Step 1. The value zero is also rejected. + let base = bigint::Elem::from_be_bytes_padded(untrusted::Input::from(base), n)?; + + // Step 2 + let c = base; + + // Step 2.b.i. + let q_bits = self.q.modulus.len_bits(); + let m_1 = elem_exp_consttime(&c, &self.p, q_bits, cpu_features)?; + let m_2 = elem_exp_consttime(&c, &self.q, self.p.modulus.len_bits(), cpu_features)?; + + // Step 2.b.ii isn't needed since there are only two primes. + + // Step 2.b.iii. + let h = { + let p = &self.p.modulus.modulus(cpu_features); + let m_2 = bigint::elem_reduced_once(p.alloc_zero(), &m_2, p, q_bits); + let m_1_minus_m_2 = bigint::elem_sub(m_1, &m_2, p); + bigint::elem_mul(&self.qInv, m_1_minus_m_2, p) + }; + + // Step 2.b.iv. The reduction in the modular multiplication isn't + // necessary because `h < p` and `p * q == n` implies `h * q < n`. + // Modular arithmetic is used simply to avoid implementing + // non-modular arithmetic. + let p_bits = self.p.modulus.len_bits(); + let h = bigint::elem_widen(n.alloc_zero(), h, n, p_bits)?; + let q_mod_n = self.q.modulus.to_elem(n)?; + let q_mod_n = bigint::elem_mul(n_one, q_mod_n, n); + let q_times_h = bigint::elem_mul(&q_mod_n, h, n); + let m_2 = bigint::elem_widen(n.alloc_zero(), m_2, n, q_bits)?; + let m = bigint::elem_add(m_2, q_times_h, n); + + // Step 2.b.v isn't needed since there are only two primes. + + // Verify the result to protect against fault attacks as described + // in "On the Importance of Checking Cryptographic Protocols for + // Faults" by Dan Boneh, Richard A. DeMillo, and Richard J. Lipton. + // This check is cheap assuming `e` is small, which is ensured during + // `KeyPair` construction. Note that this is the only validation of `e` + // that is done other than basic checks on its size, oddness, and + // minimum value, since the relationship of `e` to `d`, `p`, and `q` is + // not verified during `KeyPair` construction. 
+ { + let verify = n.alloc_zero(); + let verify = self + .public + .inner() + .exponentiate_elem(verify, &m, cpu_features); + bigint::elem_verify_equal_consttime(&verify, &c)?; + } + + // Step 3 will be done by the caller. + + Ok(m) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::testutil as test; + use alloc::vec; + + #[test] + fn test_rsakeypair_private_exponentiate() { + let cpu = cpu::features(); + test::run( + test_vector_file!("keypair_private_exponentiate_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let key = test_case.consume_bytes("Key"); + let key = KeyPair::from_pkcs8(&key).unwrap(); + let test_cases = &[ + test_case.consume_bytes("p"), + test_case.consume_bytes("p_plus_1"), + test_case.consume_bytes("p_minus_1"), + test_case.consume_bytes("q"), + test_case.consume_bytes("q_plus_1"), + test_case.consume_bytes("q_minus_1"), + ]; + for test_case in test_cases { + // THe call to `elem_verify_equal_consttime` will cause + // `private_exponentiate` to fail if the computation is + // incorrect. + let mut padded = vec![0; key.public.modulus_len()]; + let zeroes = padded.len() - test_case.len(); + padded[zeroes..].copy_from_slice(test_case); + let _: bigint::Elem<_> = key.private_exponentiate(&padded, cpu).unwrap(); + } + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/rsa/keypair_components.rs b/ring-0.17.14/src/rsa/keypair_components.rs new file mode 100644 index 0000000000..66e76792ae --- /dev/null +++ b/ring-0.17.14/src/rsa/keypair_components.rs @@ -0,0 +1,38 @@ +use super::PublicKeyComponents; + +/// RSA key pair components. +#[derive(Clone, Copy)] +pub struct KeyPairComponents { + /// The public key components. + pub public_key: PublicKeyComponents, + + /// The private exponent. + pub d: Private, + + /// The first prime factor of `d`. + pub p: Private, + + /// The second prime factor of `d`. + pub q: Private, + + /// `p`'s public Chinese Remainder Theorem exponent. + pub dP: Private, + + /// `q`'s public Chinese Remainder Theorem exponent. + pub dQ: Private, + + /// `q**-1 mod p`. + pub qInv: Private, +} + +impl core::fmt::Debug for KeyPairComponents +where + PublicKeyComponents: core::fmt::Debug, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { + // Non-public components are intentionally skipped + f.debug_struct("KeyPairComponents") + .field("public_key", &self.public_key) + .finish() + } +} diff --git a/ring-0.17.14/src/rsa/padding.rs b/ring-0.17.14/src/rsa/padding.rs new file mode 100644 index 0000000000..98a45fc3c1 --- /dev/null +++ b/ring-0.17.14/src/rsa/padding.rs @@ -0,0 +1,177 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
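
The signing path added in keypair.rs above (`sign` → padding → `private_exponentiate` via the CRT, plus the fault-attack verification step) is driven from the crate's public `ring::signature` API. A minimal caller-side sketch, not part of the vendored sources; the function name and the error mapping are illustrative:

```rust
use ring::{rand, signature};

fn sign_message(pkcs8_der: &[u8], msg: &[u8]) -> Result<Vec<u8>, ring::error::Unspecified> {
    // `from_pkcs8` performs the `KeyPair::from_components()` validation described above.
    let key_pair = signature::RsaKeyPair::from_pkcs8(pkcs8_der)
        .map_err(|_| ring::error::Unspecified)?;

    // The output buffer must be exactly modulus_len() bytes; `sign()` digests
    // `msg`, applies PKCS#1 v1.5 padding, and exponentiates via the CRT path.
    let rng = rand::SystemRandom::new();
    let mut sig = vec![0u8; key_pair.public().modulus_len()];
    key_pair.sign(&signature::RSA_PKCS1_SHA256, &rng, msg, &mut sig)?;
    Ok(sig)
}
```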
+ +use crate::{bb, bits, digest, error, rand}; + +mod pkcs1; +mod pss; + +pub use self::{ + pkcs1::{RSA_PKCS1_SHA256, RSA_PKCS1_SHA384, RSA_PKCS1_SHA512}, + pss::{RSA_PSS_SHA256, RSA_PSS_SHA384, RSA_PSS_SHA512}, +}; +pub(super) use pkcs1::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY; + +/// Common features of both RSA padding encoding and RSA padding verification. +pub trait Padding: 'static + Sync + crate::sealed::Sealed + core::fmt::Debug { + // The digest algorithm used for digesting the message (and maybe for + // other things). + fn digest_alg(&self) -> &'static digest::Algorithm; +} + +pub(super) fn encode( + encoding: &dyn RsaEncoding, + m_hash: digest::Digest, + m_out: &mut [u8], + mod_bits: bits::BitLength, + rng: &dyn rand::SecureRandom, +) -> Result<(), error::Unspecified> { + #[allow(deprecated)] + encoding.encode(m_hash, m_out, mod_bits, rng) +} + +/// An RSA signature encoding as described in [RFC 3447 Section 8]. +/// +/// [RFC 3447 Section 8]: https://tools.ietf.org/html/rfc3447#section-8 +#[cfg(feature = "alloc")] +pub trait RsaEncoding: Padding { + #[deprecated(note = "internal API that will be removed")] + #[doc(hidden)] + fn encode( + &self, + m_hash: digest::Digest, + m_out: &mut [u8], + mod_bits: bits::BitLength, + rng: &dyn rand::SecureRandom, + ) -> Result<(), error::Unspecified>; +} + +/// Verification of an RSA signature encoding as described in +/// [RFC 3447 Section 8]. +/// +/// [RFC 3447 Section 8]: https://tools.ietf.org/html/rfc3447#section-8 +pub trait Verification: Padding { + fn verify( + &self, + m_hash: digest::Digest, + m: &mut untrusted::Reader, + mod_bits: bits::BitLength, + ) -> Result<(), error::Unspecified>; +} + +// Masks `out` with the output of the mask-generating function MGF1 as +// described in https://tools.ietf.org/html/rfc3447#appendix-B.2.1. +fn mgf1(digest_alg: &'static digest::Algorithm, seed: &[u8], out: &mut [u8]) { + let digest_len = digest_alg.output_len(); + + // Maximum counter value is the value of (mask_len / digest_len) rounded up. + for (i, out) in out.chunks_mut(digest_len).enumerate() { + let mut ctx = digest::Context::new(digest_alg); + ctx.update(seed); + // The counter will always fit in a `u32` because we reject absurdly + // long inputs very early. + ctx.update(&u32::to_be_bytes(i.try_into().unwrap())); + let digest = ctx.finish(); + + // The last chunk may legitimately be shorter than `digest`, but + // `digest` will never be shorter than `out`. + bb::xor_assign_at_start(out, digest.as_ref()); + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::testutil as test; + use crate::{digest, error}; + use alloc::vec; + + #[test] + fn test_pss_padding_verify() { + test::run( + test_vector_file!("rsa_pss_padding_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &RSA_PSS_SHA256, + "SHA384" => &RSA_PSS_SHA384, + "SHA512" => &RSA_PSS_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let msg = test_case.consume_bytes("Msg"); + let msg = untrusted::Input::from(&msg); + let m_hash = digest::digest(alg.digest_alg(), msg.as_slice_less_safe()); + + let encoded = test_case.consume_bytes("EM"); + let encoded = untrusted::Input::from(&encoded); + + // Salt is recomputed in verification algorithm. 
+ let _ = test_case.consume_bytes("Salt"); + + let bit_len = test_case.consume_usize_bits("Len"); + let is_valid = test_case.consume_string("Result") == "P"; + + let actual_result = + encoded.read_all(error::Unspecified, |m| alg.verify(m_hash, m, bit_len)); + assert_eq!(actual_result.is_ok(), is_valid); + + Ok(()) + }, + ); + } + + // Tests PSS encoding for various public modulus lengths. + #[cfg(feature = "alloc")] + #[test] + fn test_pss_padding_encode() { + test::run( + test_vector_file!("rsa_pss_padding_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &RSA_PSS_SHA256, + "SHA384" => &RSA_PSS_SHA384, + "SHA512" => &RSA_PSS_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let msg = test_case.consume_bytes("Msg"); + let salt = test_case.consume_bytes("Salt"); + let encoded = test_case.consume_bytes("EM"); + let bit_len = test_case.consume_usize_bits("Len"); + let expected_result = test_case.consume_string("Result"); + + // Only test the valid outputs + if expected_result != "P" { + return Ok(()); + } + + let rng = test::rand::FixedSliceRandom { bytes: &salt }; + + let mut m_out = vec![0u8; bit_len.as_usize_bytes_rounded_up()]; + let digest = digest::digest(alg.digest_alg(), &msg); + #[allow(deprecated)] + alg.encode(digest, &mut m_out, bit_len, &rng).unwrap(); + assert_eq!(m_out, encoded); + + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/rsa/padding/pkcs1.rs b/ring-0.17.14/src/rsa/padding/pkcs1.rs new file mode 100644 index 0000000000..880b1dc115 --- /dev/null +++ b/ring-0.17.14/src/rsa/padding/pkcs1.rs @@ -0,0 +1,177 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{super::PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, Padding, RsaEncoding, Verification}; +use crate::{bits, digest, error, io::der, rand}; + +/// PKCS#1 1.5 padding as described in [RFC 3447 Section 8.2]. +/// +/// See "`RSA_PSS_*` Details\" in `ring::signature`'s module-level +/// documentation for more details. 
+/// +/// [RFC 3447 Section 8.2]: https://tools.ietf.org/html/rfc3447#section-8.2 +#[derive(Debug)] +pub struct PKCS1 { + digest_alg: &'static digest::Algorithm, + digestinfo_prefix: &'static [u8], +} + +impl crate::sealed::Sealed for PKCS1 {} + +impl Padding for PKCS1 { + fn digest_alg(&self) -> &'static digest::Algorithm { + self.digest_alg + } +} + +impl RsaEncoding for PKCS1 { + fn encode( + &self, + m_hash: digest::Digest, + m_out: &mut [u8], + _mod_bits: bits::BitLength, + _rng: &dyn rand::SecureRandom, + ) -> Result<(), error::Unspecified> { + pkcs1_encode(self, m_hash, m_out); + Ok(()) + } +} + +impl Verification for PKCS1 { + fn verify( + &self, + m_hash: digest::Digest, + m: &mut untrusted::Reader, + mod_bits: bits::BitLength, + ) -> Result<(), error::Unspecified> { + // `mod_bits.as_usize_bytes_rounded_up() <= + // PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN` is ensured by `verify_rsa_()`. + let mut calculated = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; + let calculated = &mut calculated[..mod_bits.as_usize_bytes_rounded_up()]; + pkcs1_encode(self, m_hash, calculated); + if m.read_bytes_to_end().as_slice_less_safe() != calculated { + return Err(error::Unspecified); + } + Ok(()) + } +} + +// Implement padding procedure per EMSA-PKCS1-v1_5, +// https://tools.ietf.org/html/rfc3447#section-9.2. This is used by both +// verification and signing so it needs to be able to handle moduli of the +// minimum and maximum sizes for both operations. +fn pkcs1_encode(pkcs1: &PKCS1, m_hash: digest::Digest, m_out: &mut [u8]) { + let em = m_out; + + let digest_len = pkcs1.digestinfo_prefix.len() + pkcs1.digest_alg.output_len(); + + // The specification requires at least 8 bytes of padding. Since we + // disallow keys smaller than 1024 bits, this should always be true. + assert!(em.len() >= digest_len + 11); + let pad_len = em.len() - digest_len - 3; + em[0] = 0; + em[1] = 1; + for i in 0..pad_len { + em[2 + i] = 0xff; + } + em[2 + pad_len] = 0; + + let (digest_prefix, digest_dst) = em[3 + pad_len..].split_at_mut(pkcs1.digestinfo_prefix.len()); + digest_prefix.copy_from_slice(pkcs1.digestinfo_prefix); + digest_dst.copy_from_slice(m_hash.as_ref()); +} + +macro_rules! rsa_pkcs1_padding { + ( $vis:vis $PADDING_ALGORITHM:ident, $digest_alg:expr, $digestinfo_prefix:expr, + $doc_str:expr ) => { + #[doc=$doc_str] + $vis static $PADDING_ALGORITHM: PKCS1 = PKCS1 { + digest_alg: $digest_alg, + digestinfo_prefix: $digestinfo_prefix, + }; + }; +} + +// Intentionally not exposed except internally for signature verification. At a +// minimum, we'd need to create test vectors for signing with it, which we +// don't currently have. But, it's a bad idea to use SHA-1 anyway, so perhaps +// we just won't ever expose it. +rsa_pkcs1_padding!( + pub(in super::super) RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, + &digest::SHA1_FOR_LEGACY_USE_ONLY, + &SHA1_PKCS1_DIGESTINFO_PREFIX, + "PKCS#1 1.5 padding using SHA-1 for RSA signatures." +); + +rsa_pkcs1_padding!( + pub RSA_PKCS1_SHA256, + &digest::SHA256, + &SHA256_PKCS1_DIGESTINFO_PREFIX, + "PKCS#1 1.5 padding using SHA-256 for RSA signatures." +); + +rsa_pkcs1_padding!( + pub RSA_PKCS1_SHA384, + &digest::SHA384, + &SHA384_PKCS1_DIGESTINFO_PREFIX, + "PKCS#1 1.5 padding using SHA-384 for RSA signatures." +); + +rsa_pkcs1_padding!( + pub RSA_PKCS1_SHA512, + &digest::SHA512, + &SHA512_PKCS1_DIGESTINFO_PREFIX, + "PKCS#1 1.5 padding using SHA-512 for RSA signatures." +); + +macro_rules! 
pkcs1_digestinfo_prefix { + ( $name:ident, $digest_len:expr, $digest_oid_len:expr, + [ $( $digest_oid:expr ),* ] ) => { + static $name: [u8; 2 + 8 + $digest_oid_len] = [ + der::Tag::Sequence.into(), 8 + $digest_oid_len + $digest_len, + der::Tag::Sequence.into(), 2 + $digest_oid_len + 2, + der::Tag::OID.into(), $digest_oid_len, $( $digest_oid ),*, + der::Tag::Null.into(), 0, + der::Tag::OctetString.into(), $digest_len, + ]; + } +} + +pkcs1_digestinfo_prefix!( + SHA1_PKCS1_DIGESTINFO_PREFIX, + 20, + 5, + [0x2b, 0x0e, 0x03, 0x02, 0x1a] +); + +pkcs1_digestinfo_prefix!( + SHA256_PKCS1_DIGESTINFO_PREFIX, + 32, + 9, + [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01] +); + +pkcs1_digestinfo_prefix!( + SHA384_PKCS1_DIGESTINFO_PREFIX, + 48, + 9, + [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02] +); + +pkcs1_digestinfo_prefix!( + SHA512_PKCS1_DIGESTINFO_PREFIX, + 64, + 9, + [0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03] +); diff --git a/ring-0.17.14/src/rsa/padding/pss.rs b/ring-0.17.14/src/rsa/padding/pss.rs new file mode 100644 index 0000000000..8dd126ccc0 --- /dev/null +++ b/ring-0.17.14/src/rsa/padding/pss.rs @@ -0,0 +1,289 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{super::PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, mgf1, Padding, RsaEncoding, Verification}; +use crate::{bb, bits, digest, error, rand}; + +/// RSA PSS padding as described in [RFC 3447 Section 8.1]. +/// +/// See "`RSA_PSS_*` Details\" in `ring::signature`'s module-level +/// documentation for more details. +/// +/// [RFC 3447 Section 8.1]: https://tools.ietf.org/html/rfc3447#section-8.1 +#[allow(clippy::upper_case_acronyms)] // TODO: Until we implement cargo-semver-checks +#[derive(Debug)] +pub struct PSS { + digest_alg: &'static digest::Algorithm, +} + +impl crate::sealed::Sealed for PSS {} + +impl Padding for PSS { + fn digest_alg(&self) -> &'static digest::Algorithm { + self.digest_alg + } +} + +impl RsaEncoding for PSS { + // Implement padding procedure per EMSA-PSS, + // https://tools.ietf.org/html/rfc3447#section-9.1. + fn encode( + &self, + m_hash: digest::Digest, + m_out: &mut [u8], + mod_bits: bits::BitLength, + rng: &dyn rand::SecureRandom, + ) -> Result<(), error::Unspecified> { + let metrics = PSSMetrics::new(self.digest_alg, mod_bits)?; + + // The `m_out` this function fills is the big-endian-encoded value of `m` + // from the specification, padded to `k` bytes, where `k` is the length + // in bytes of the public modulus. The spec says "Note that emLen will + // be one less than k if modBits - 1 is divisible by 8 and equal to k + // otherwise." In other words we might need to prefix `em` with a + // leading zero byte to form a correct value of `m`. + let em = if metrics.top_byte_mask == 0xff { + m_out[0] = 0; + &mut m_out[1..] 
+ } else { + m_out + }; + assert_eq!(em.len(), metrics.em_len); + + // Steps 1 and 2 are done by the caller to produce `m_hash`. + + // Step 3 is done by `PSSMetrics::new()` above. + + let (db, digest_terminator) = em.split_at_mut(metrics.db_len); + + let separator_pos = db.len() - 1 - metrics.s_len; + + // Step 4. + let salt: &[u8] = { + let salt = &mut db[(separator_pos + 1)..]; + rng.fill(salt)?; // salt + salt + }; + + // Steps 5 and 6. + let h = pss_digest(self.digest_alg, m_hash, salt); + + // Step 7. + db[..separator_pos].fill(0); // ps + + // Step 8. + db[separator_pos] = 0x01; + + // Steps 9 and 10. + mgf1(self.digest_alg, h.as_ref(), db); + + // Step 11. + db[0] &= metrics.top_byte_mask; + + // Step 12. + digest_terminator[..metrics.h_len].copy_from_slice(h.as_ref()); + digest_terminator[metrics.h_len] = 0xbc; + + Ok(()) + } +} + +impl Verification for PSS { + // RSASSA-PSS-VERIFY from https://tools.ietf.org/html/rfc3447#section-8.1.2 + // where steps 1, 2(a), and 2(b) have been done for us. + fn verify( + &self, + m_hash: digest::Digest, + m: &mut untrusted::Reader, + mod_bits: bits::BitLength, + ) -> Result<(), error::Unspecified> { + let metrics = PSSMetrics::new(self.digest_alg, mod_bits)?; + + // RSASSA-PSS-VERIFY Step 2(c). The `m` this function is given is the + // big-endian-encoded value of `m` from the specification, padded to + // `k` bytes, where `k` is the length in bytes of the public modulus. + // The spec. says "Note that emLen will be one less than k if + // modBits - 1 is divisible by 8 and equal to k otherwise," where `k` + // is the length in octets of the RSA public modulus `n`. In other + // words, `em` might have an extra leading zero byte that we need to + // strip before we start the PSS decoding steps which is an artifact of + // the `Verification` interface. + if metrics.top_byte_mask == 0xff { + if m.read_byte()? != 0 { + return Err(error::Unspecified); + } + }; + let em = m; + + // The rest of this function is EMSA-PSS-VERIFY from + // https://tools.ietf.org/html/rfc3447#section-9.1.2. + + // Steps 1 and 2 are done by the caller to produce `m_hash`. + + // Step 3 is done by `PSSMetrics::new()` above. + + // Step 5, out of order. + let masked_db = em.read_bytes(metrics.db_len)?; + let h_hash = em.read_bytes(metrics.h_len)?; + + // Step 4. + if em.read_byte()? != 0xbc { + return Err(error::Unspecified); + } + + // Step 7. + let mut db = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; + let db = &mut db[..metrics.db_len]; + + mgf1(self.digest_alg, h_hash.as_slice_less_safe(), db); + + masked_db.read_all(error::Unspecified, |masked_bytes| { + // Step 6. Check the top bits of first byte are zero. + let b = masked_bytes.read_byte()?; + if b & !metrics.top_byte_mask != 0 { + return Err(error::Unspecified); + } + db[0] ^= b; + + // Step 8. + let db_rest = &mut db[1..]; + let masked_bytes = masked_bytes.read_bytes(db_rest.len())?; + bb::xor_assign_at_start(db_rest, masked_bytes.as_slice_less_safe()); + Ok(()) + })?; + + // Step 9. + db[0] &= metrics.top_byte_mask; + + // Step 10. + let ps_len = metrics.ps_len; + if db[0..ps_len].iter().any(|&db| db != 0) { + return Err(error::Unspecified); + } + if db[metrics.ps_len] != 1 { + return Err(error::Unspecified); + } + + // Step 11. + let salt = &db[(db.len() - metrics.s_len)..]; + + // Step 12 and 13. + let h_prime = pss_digest(self.digest_alg, m_hash, salt); + + // Step 14. 
+ if h_hash.as_slice_less_safe() != h_prime.as_ref() { + return Err(error::Unspecified); + } + + Ok(()) + } +} + +struct PSSMetrics { + #[cfg_attr(not(feature = "alloc"), allow(dead_code))] + em_len: usize, + db_len: usize, + ps_len: usize, + s_len: usize, + h_len: usize, + top_byte_mask: u8, +} + +impl PSSMetrics { + fn new( + digest_alg: &'static digest::Algorithm, + mod_bits: bits::BitLength, + ) -> Result { + let em_bits = mod_bits.try_sub_1()?; + let em_len = em_bits.as_usize_bytes_rounded_up(); + let leading_zero_bits = (8 * em_len) - em_bits.as_bits(); + debug_assert!(leading_zero_bits < 8); + let top_byte_mask = 0xffu8 >> leading_zero_bits; + + let h_len = digest_alg.output_len(); + + // We require the salt length to be equal to the digest length. + let s_len = h_len; + + // Step 3 of both `EMSA-PSS-ENCODE` is `EMSA-PSS-VERIFY` requires that + // we reject inputs where "emLen < hLen + sLen + 2". The definition of + // `emBits` in RFC 3447 Sections 9.1.1 and 9.1.2 says `emBits` must be + // "at least 8hLen + 8sLen + 9". Since 9 bits requires two bytes, these + // two conditions are equivalent. 9 bits are required as the 0x01 + // before the salt requires 1 bit and the 0xbc after the digest + // requires 8 bits. + let db_len = em_len.checked_sub(1 + s_len).ok_or(error::Unspecified)?; + let ps_len = db_len.checked_sub(h_len + 1).ok_or(error::Unspecified)?; + + debug_assert!(em_bits.as_bits() >= (8 * h_len) + (8 * s_len) + 9); + + Ok(Self { + em_len, + db_len, + ps_len, + s_len, + h_len, + top_byte_mask, + }) + } +} + +fn pss_digest( + digest_alg: &'static digest::Algorithm, + m_hash: digest::Digest, + salt: &[u8], +) -> digest::Digest { + // Fixed prefix. + const PREFIX_ZEROS: [u8; 8] = [0u8; 8]; + + // Encoding step 5 and 6, Verification step 12 and 13. + let mut ctx = digest::Context::new(digest_alg); + ctx.update(&PREFIX_ZEROS); + ctx.update(m_hash.as_ref()); + ctx.update(salt); + ctx.finish() +} + +macro_rules! rsa_pss_padding { + ( $vis:vis $PADDING_ALGORITHM:ident, $digest_alg:expr, $doc_str:expr ) => { + #[doc=$doc_str] + $vis static $PADDING_ALGORITHM: PSS = PSS { + digest_alg: $digest_alg, + }; + }; +} + +rsa_pss_padding!( + pub RSA_PSS_SHA256, + &digest::SHA256, + "RSA PSS padding using SHA-256 for RSA signatures.\n\nSee + \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level + documentation for more details." +); + +rsa_pss_padding!( + pub RSA_PSS_SHA384, + &digest::SHA384, + "RSA PSS padding using SHA-384 for RSA signatures.\n\nSee + \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level + documentation for more details." +); + +rsa_pss_padding!( + pub RSA_PSS_SHA512, + &digest::SHA512, + "RSA PSS padding using SHA-512 for RSA signatures.\n\nSee + \"`RSA_PSS_*` Details\" in `ring::signature`'s module-level + documentation for more details." +); diff --git a/ring-0.17.14/src/rsa/public_exponent.rs b/ring-0.17.14/src/rsa/public_exponent.rs new file mode 100644 index 0000000000..b87b30b217 --- /dev/null +++ b/ring-0.17.14/src/rsa/public_exponent.rs @@ -0,0 +1,104 @@ +use crate::error; +use crate::polyfill::{unwrap_const, ArrayFlatMap, LeadingZerosStripped}; +use core::num::NonZeroU64; + +/// The exponent `e` of an RSA public key. 
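+///
+/// Exponents are restricted to odd values of at most 33 bits (see `MAX`
+/// below); `from_be_bytes` additionally enforces a caller-supplied minimum,
+/// e.g. `PublicExponent::_3` for verification or `PublicExponent::_65537`
+/// for signing.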
+#[derive(Clone, Copy)] +pub struct PublicExponent(NonZeroU64); + +impl PublicExponent { + #[cfg(test)] + const ALL_CONSTANTS: [Self; 3] = [Self::_3, Self::_65537, Self::MAX]; + + pub(super) const _3: Self = Self(unwrap_const(NonZeroU64::new(3))); + pub(super) const _65537: Self = Self(unwrap_const(NonZeroU64::new(65537))); + + // This limit was chosen to bound the performance of the simple + // exponentiation-by-squaring implementation in `elem_exp_vartime`. In + // particular, it helps mitigate theoretical resource exhaustion attacks. 33 + // bits was chosen as the limit based on the recommendations in [1] and + // [2]. Windows CryptoAPI (at least older versions) doesn't support values + // larger than 32 bits [3], so it is unlikely that exponents larger than 32 + // bits are being used for anything Windows commonly does. + // + // [1] https://www.imperialviolet.org/2012/03/16/rsae.html + // [2] https://www.imperialviolet.org/2012/03/17/rsados.html + // [3] https://msdn.microsoft.com/en-us/library/aa387685(VS.85).aspx + const MAX: Self = Self(unwrap_const(NonZeroU64::new((1u64 << 33) - 1))); + + pub(super) fn from_be_bytes( + input: untrusted::Input, + min_value: Self, + ) -> Result { + // See `PublicKey::from_modulus_and_exponent` for background on the step + // numbering. + + if input.len() > 5 { + return Err(error::KeyRejected::too_large()); + } + let value = input.read_all(error::KeyRejected::invalid_encoding(), |input| { + // The exponent can't be zero and it can't be prefixed with + // zero-valued bytes. + if input.peek(0) { + return Err(error::KeyRejected::invalid_encoding()); + } + let mut value = 0u64; + loop { + let byte = input + .read_byte() + .map_err(|untrusted::EndOfInput| error::KeyRejected::invalid_encoding())?; + value = (value << 8) | u64::from(byte); + if input.at_end() { + return Ok(value); + } + } + })?; + + // Step 2 / Step b. NIST SP800-89 defers to FIPS 186-3, which requires + // `e >= 65537`. We enforce this when signing, but are more flexible in + // verification, for compatibility. Only small public exponents are + // supported. + let value = NonZeroU64::new(value).ok_or_else(error::KeyRejected::too_small)?; + if value < min_value.0 { + return Err(error::KeyRejected::too_small()); + } + if value > Self::MAX.0 { + return Err(error::KeyRejected::too_large()); + } + + // Step 3 / Step c. + if value.get() & 1 != 1 { + return Err(error::KeyRejected::invalid_component()); + } + + Ok(Self(value)) + } + + /// The big-endian encoding of the exponent. + /// + /// There are no leading zeros. + pub fn be_bytes(&self) -> impl ExactSizeIterator + Clone + '_ { + // The `unwrap()` won't fail as `self.0` is only a few bytes long. + let bytes = ArrayFlatMap::new(core::iter::once(self.0.get()), u64::to_be_bytes).unwrap(); + LeadingZerosStripped::new(bytes) + } + + pub(super) fn value(self) -> NonZeroU64 { + self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_public_exponent_constants() { + for value in PublicExponent::ALL_CONSTANTS.iter() { + let value: u64 = value.0.into(); + assert_eq!(value & 1, 1); + assert!(value >= PublicExponent::_3.0.into()); // The absolute minimum. + assert!(value <= PublicExponent::MAX.0.into()); + } + } +} diff --git a/ring-0.17.14/src/rsa/public_key.rs b/ring-0.17.14/src/rsa/public_key.rs new file mode 100644 index 0000000000..87e1a30d6e --- /dev/null +++ b/ring-0.17.14/src/rsa/public_key.rs @@ -0,0 +1,228 @@ +// Copyright 2015-2021 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{PublicExponent, PublicModulus, N, PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN}; +use crate::{ + arithmetic::bigint, + bits, cpu, error, + io::{self, der, der_writer}, + limb::LIMB_BYTES, +}; +use alloc::boxed::Box; +use core::num::NonZeroU64; + +/// An RSA Public Key. +#[derive(Clone)] +pub struct PublicKey { + inner: Inner, + serialized: Box<[u8]>, +} + +derive_debug_self_as_ref_hex_bytes!(PublicKey); + +impl PublicKey { + pub(super) fn from_modulus_and_exponent( + n: untrusted::Input, + e: untrusted::Input, + n_min_bits: bits::BitLength, + n_max_bits: bits::BitLength, + e_min_value: PublicExponent, + cpu_features: cpu::Features, + ) -> Result { + let inner = Inner::from_modulus_and_exponent( + n, + e, + n_min_bits, + n_max_bits, + e_min_value, + cpu_features, + )?; + + let n_bytes = n; + let e_bytes = e; + + // TODO: Remove this re-parsing, and stop allocating this here. + // Instead we should serialize on demand without allocation, from + // `Modulus::be_bytes()` and `Exponent::be_bytes()`. Once this is + // fixed, merge `Inner` back into `PublicKey`. + let n_bytes = io::Positive::from_be_bytes(n_bytes) + .map_err(|_: error::Unspecified| error::KeyRejected::unexpected_error())?; + let e_bytes = io::Positive::from_be_bytes(e_bytes) + .map_err(|_: error::Unspecified| error::KeyRejected::unexpected_error())?; + let serialized = der_writer::write_all(der::Tag::Sequence, &|output| { + der_writer::write_positive_integer(output, &n_bytes)?; + der_writer::write_positive_integer(output, &e_bytes) + }) + .map_err(|_: io::TooLongError| error::KeyRejected::unexpected_error())?; + + Ok(Self { inner, serialized }) + } + + /// The length, in bytes, of the public modulus. + /// + /// The modulus length is rounded up to a whole number of bytes if its + /// bit length isn't a multiple of 8. + pub fn modulus_len(&self) -> usize { + self.inner.n().len_bits().as_usize_bytes_rounded_up() + } + + pub(super) fn inner(&self) -> &Inner { + &self.inner + } +} + +/// `PublicKey` but without any superfluous allocations, optimized for one-shot +/// RSA signature verification. +#[derive(Clone)] +pub(crate) struct Inner { + n: PublicModulus, + e: PublicExponent, +} + +impl Inner { + pub(super) fn from_modulus_and_exponent( + n: untrusted::Input, + e: untrusted::Input, + n_min_bits: bits::BitLength, + n_max_bits: bits::BitLength, + e_min_value: PublicExponent, + cpu_features: cpu::Features, + ) -> Result { + // This is an incomplete implementation of NIST SP800-56Br1 Section + // 6.4.2.2, "Partial Public-Key Validation for RSA." That spec defers + // to NIST SP800-89 Section 5.3.3, "(Explicit) Partial Public Key + // Validation for RSA," "with the caveat that the length of the modulus + // shall be a length that is specified in this Recommendation." 
In + // SP800-89, two different sets of steps are given, one set numbered, + // and one set lettered. TODO: Document this in the end-user + // documentation for RSA keys. + + let n = PublicModulus::from_be_bytes(n, n_min_bits..=n_max_bits, cpu_features)?; + + let e = PublicExponent::from_be_bytes(e, e_min_value)?; + + // If `n` is less than `e` then somebody has probably accidentally swapped + // them. The largest acceptable `e` is smaller than the smallest acceptable + // `n`, so no additional checks need to be done. + + // XXX: Steps 4 & 5 / Steps d, e, & f are not implemented. This is also the + // case in most other commonly-used crypto libraries. + + Ok(Self { n, e }) + } + + /// The public modulus. + #[inline] + pub(super) fn n(&self) -> &PublicModulus { + &self.n + } + + /// The public exponent. + #[inline] + pub(super) fn e(&self) -> PublicExponent { + self.e + } + + /// Calculates base**e (mod n), filling the first part of `out_buffer` with + /// the result. + /// + /// This is constant-time with respect to the value in `base` (only). + /// + /// The result will be a slice of the encoded bytes of the result within + /// `out_buffer`, if successful. + pub(super) fn exponentiate<'out>( + &self, + base: untrusted::Input, + out_buffer: &'out mut [u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN], + cpu_features: cpu::Features, + ) -> Result<&'out [u8], error::Unspecified> { + let n = &self.n.value(cpu_features); + + // The encoded value of the base must be the same length as the modulus, + // in bytes. + if base.len() != self.n.len_bits().as_usize_bytes_rounded_up() { + return Err(error::Unspecified); + } + + // RFC 8017 Section 5.2.2: RSAVP1. + + // Step 1. + let s = bigint::Elem::from_be_bytes_padded(base, n)?; + if s.is_zero() { + return Err(error::Unspecified); + } + + // Step 2. + let m = n.alloc_zero(); + let m = self.exponentiate_elem(m, &s, cpu_features); + + // Step 3. + Ok(fill_be_bytes_n(m, self.n.len_bits(), out_buffer)) + } + + /// Calculates base**e (mod n). + /// + /// This is constant-time with respect to `base` only. + pub(super) fn exponentiate_elem( + &self, + out: bigint::Storage, + base: &bigint::Elem, + cpu_features: cpu::Features, + ) -> bigint::Elem { + // The exponent was already checked to be at least 3. + let exponent_without_low_bit = NonZeroU64::try_from(self.e.value().get() & !1).unwrap(); + // The exponent was already checked to be odd. + debug_assert_ne!(exponent_without_low_bit, self.e.value()); + + let n = &self.n.value(cpu_features); + + let tmp = n.alloc_zero(); + let base_r = bigint::elem_mul_into(tmp, self.n.oneRR(), base, n); + + // During RSA public key operations the exponent is almost always either + // 65537 (0b10000000000000001) or 3 (0b11), both of which have a Hamming + // weight of 2. The maximum bit length and maximum Hamming weight of the + // exponent is bounded by the value of `PublicExponent::MAX`. + let acc = bigint::elem_exp_vartime(out, base_r, exponent_without_low_bit, n); + + // Now do the multiplication for the low bit and convert out of the Montgomery domain. + bigint::elem_mul(base, acc, n) + } +} + +// XXX: Refactor `signature::KeyPair` to get rid of this. +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + &self.serialized + } +} + +/// Returns the big-endian representation of `elem` that is +/// the same length as the minimal-length big-endian representation of +/// the modulus `n`. +/// +/// `n_bits` must be the bit length of the public modulus `n`. 
+fn fill_be_bytes_n( + elem: bigint::Elem, + n_bits: bits::BitLength, + out: &mut [u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN], +) -> &[u8] { + let n_bytes = n_bits.as_usize_bytes_rounded_up(); + let n_bytes_padded = ((n_bytes + (LIMB_BYTES - 1)) / LIMB_BYTES) * LIMB_BYTES; + let out = &mut out[..n_bytes_padded]; + elem.fill_be_bytes(out); + let (padding, out) = out.split_at(n_bytes_padded - n_bytes); + assert!(padding.iter().all(|&b| b == 0)); + out +} diff --git a/ring-0.17.14/src/rsa/public_key_components.rs b/ring-0.17.14/src/rsa/public_key_components.rs new file mode 100644 index 0000000000..3220d75175 --- /dev/null +++ b/ring-0.17.14/src/rsa/public_key_components.rs @@ -0,0 +1,52 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::PublicKey; +use core::iter::FromIterator; + +/// RSA public key components. +/// +/// `B` must implement `AsRef<[u8]>` like `&[u8]` or `Vec`. +#[derive(Clone, Copy)] +pub struct PublicKeyComponents { + /// The public modulus, encoded in big-endian bytes without leading zeros. + pub n: B, + + /// The public exponent, encoded in big-endian bytes without leading zeros. + pub e: B, +} + +impl core::fmt::Debug for PublicKeyComponents +where + B: core::fmt::Debug, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { + f.debug_struct("PublicKeyComponents") + .field("n", &self.n) + .field("e", &self.e) + .finish() + } +} + +impl From<&PublicKey> for PublicKeyComponents +where + B: FromIterator, +{ + fn from(public_key: &PublicKey) -> Self { + Self { + n: public_key.inner().n().be_bytes().collect(), + e: public_key.inner().e().be_bytes().collect(), + } + } +} diff --git a/ring-0.17.14/src/rsa/public_modulus.rs b/ring-0.17.14/src/rsa/public_modulus.rs new file mode 100644 index 0000000000..c40e49edb0 --- /dev/null +++ b/ring-0.17.14/src/rsa/public_modulus.rs @@ -0,0 +1,98 @@ +use crate::{ + arithmetic::{bigint, montgomery::RR}, + bits::{self, FromByteLen as _}, + cpu, + error::{self, InputTooLongError}, + rsa::N, +}; +use core::ops::RangeInclusive; + +/// The modulus (n) of an RSA public key. +pub struct PublicModulus { + value: bigint::OwnedModulus, + oneRR: bigint::One, +} + +impl Clone for PublicModulus { + fn clone(&self) -> Self { + let PublicModulus { value, oneRR } = self; + let value = value.clone(); + + // XXX: Shouldn't really be needed just to call `alloc_zero()`, + // but not worth optimizing away. 
+ let cpu = cpu::features(); + let n = value.modulus(cpu); + let oneRR = oneRR.clone_into(n.alloc_zero()); + + Self { value, oneRR } + } +} + +/* +impl core::fmt::Debug for PublicModulus { + fn fmt(&self, fmt: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { + self.value.fmt(fmt) + } +}*/ + +impl PublicModulus { + pub(super) fn from_be_bytes( + n: untrusted::Input, + allowed_bit_lengths: RangeInclusive, + cpu_features: cpu::Features, + ) -> Result { + // See `PublicKey::from_modulus_and_exponent` for background on the step + // numbering. + + let min_bits = *allowed_bit_lengths.start(); + let max_bits = *allowed_bit_lengths.end(); + + // `pkcs1_encode` depends on this not being small. Otherwise, + // `pkcs1_encode` would generate padding that is invalid (too few 0xFF + // bytes) for very small keys. + const MIN_BITS: bits::BitLength = bits::BitLength::from_bits(1024); + + // Step 3 / Step c for `n` (out of order). + let value = bigint::OwnedModulusValue::from_be_bytes(n)?; + let bits = value.len_bits(); + + // Step 1 / Step a. XXX: SP800-56Br1 and SP800-89 require the length of + // the public modulus to be exactly 2048 or 3072 bits, but we are more + // flexible to be compatible with other commonly-used crypto libraries. + assert!(min_bits >= MIN_BITS); + let bits_rounded_up = bits::BitLength::from_byte_len(bits.as_usize_bytes_rounded_up()) + .map_err(error::erase::) + .unwrap(); // TODO: safe? + if bits_rounded_up < min_bits { + return Err(error::KeyRejected::too_small()); + } + if bits > max_bits { + return Err(error::KeyRejected::too_large()); + } + let value = bigint::OwnedModulus::from(value); + let m = value.modulus(cpu_features); + let oneRR = bigint::One::newRR(m.alloc_zero(), &m); + + Ok(Self { value, oneRR }) + } + + /// The big-endian encoding of the modulus. + /// + /// There are no leading zeros. + pub fn be_bytes(&self) -> impl ExactSizeIterator + Clone + '_ { + self.value.be_bytes() + } + + /// The length of the modulus in bits. + pub fn len_bits(&self) -> bits::BitLength { + self.value.len_bits() + } + + pub(super) fn value(&self, cpu_features: cpu::Features) -> bigint::Modulus { + self.value.modulus(cpu_features) + } + + pub(super) fn oneRR(&self) -> &bigint::Elem { + self.oneRR.as_ref() + } +} diff --git a/ring-0.17.14/src/rsa/signature_rsa_example_private_key.der b/ring-0.17.14/src/rsa/signature_rsa_example_private_key.der new file mode 100644 index 0000000000..47f08a48d7 Binary files /dev/null and b/ring-0.17.14/src/rsa/signature_rsa_example_private_key.der differ diff --git a/ring-0.17.14/src/rsa/signature_rsa_example_public_key.der b/ring-0.17.14/src/rsa/signature_rsa_example_public_key.der new file mode 100644 index 0000000000..19e944fc3e Binary files /dev/null and b/ring-0.17.14/src/rsa/signature_rsa_example_public_key.der differ diff --git a/ring-0.17.14/src/rsa/verification.rs b/ring-0.17.14/src/rsa/verification.rs new file mode 100644 index 0000000000..4ffbaec640 --- /dev/null +++ b/ring-0.17.14/src/rsa/verification.rs @@ -0,0 +1,231 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Verification of RSA signatures. + +use super::{ + parse_public_key, public_key, PublicExponent, RsaParameters, PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN, +}; +use crate::{ + bits::{self, FromByteLen as _}, + cpu, digest, + error::{self, InputTooLongError}, + sealed, signature, +}; + +impl signature::VerificationAlgorithm for RsaParameters { + fn verify( + &self, + public_key: untrusted::Input, + msg: untrusted::Input, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified> { + let (n, e) = parse_public_key(public_key)?; + verify_rsa_( + self, + ( + n.big_endian_without_leading_zero_as_input(), + e.big_endian_without_leading_zero_as_input(), + ), + msg, + signature, + cpu::features(), + ) + } +} + +impl sealed::Sealed for RsaParameters {} + +macro_rules! rsa_params { + ( $VERIFY_ALGORITHM:ident, $min_bits:expr, $PADDING_ALGORITHM:expr, + $doc_str:expr ) => { + #[doc=$doc_str] + /// + /// Only available in `alloc` mode. + pub static $VERIFY_ALGORITHM: RsaParameters = RsaParameters { + padding_alg: $PADDING_ALGORITHM, + min_bits: bits::BitLength::from_bits($min_bits), + }; + }; +} + +rsa_params!( + RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, + 1024, + &super::padding::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, + "Verification of signatures using RSA keys of 1024-8192 bits, + PKCS#1.5 padding, and SHA-1.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, + 2048, + &super::padding::RSA_PKCS1_SHA1_FOR_LEGACY_USE_ONLY, + "Verification of signatures using RSA keys of 2048-8192 bits, + PKCS#1.5 padding, and SHA-1.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, + 1024, + &super::padding::RSA_PKCS1_SHA256, + "Verification of signatures using RSA keys of 1024-8192 bits, + PKCS#1.5 padding, and SHA-256.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_2048_8192_SHA256, + 2048, + &super::padding::RSA_PKCS1_SHA256, + "Verification of signatures using RSA keys of 2048-8192 bits, + PKCS#1.5 padding, and SHA-256.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_2048_8192_SHA384, + 2048, + &super::padding::RSA_PKCS1_SHA384, + "Verification of signatures using RSA keys of 2048-8192 bits, + PKCS#1.5 padding, and SHA-384.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_2048_8192_SHA512, + 2048, + &super::padding::RSA_PKCS1_SHA512, + "Verification of signatures using RSA keys of 2048-8192 bits, + PKCS#1.5 padding, and SHA-512.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." 
+); +rsa_params!( + RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, + 1024, + &super::padding::RSA_PKCS1_SHA512, + "Verification of signatures using RSA keys of 1024-8192 bits, + PKCS#1.5 padding, and SHA-512.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PKCS1_3072_8192_SHA384, + 3072, + &super::padding::RSA_PKCS1_SHA384, + "Verification of signatures using RSA keys of 3072-8192 bits, + PKCS#1.5 padding, and SHA-384.\n\nSee \"`RSA_PKCS1_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); + +rsa_params!( + RSA_PSS_2048_8192_SHA256, + 2048, + &super::padding::RSA_PSS_SHA256, + "Verification of signatures using RSA keys of 2048-8192 bits, + PSS padding, and SHA-256.\n\nSee \"`RSA_PSS_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PSS_2048_8192_SHA384, + 2048, + &super::padding::RSA_PSS_SHA384, + "Verification of signatures using RSA keys of 2048-8192 bits, + PSS padding, and SHA-384.\n\nSee \"`RSA_PSS_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); +rsa_params!( + RSA_PSS_2048_8192_SHA512, + 2048, + &super::padding::RSA_PSS_SHA512, + "Verification of signatures using RSA keys of 2048-8192 bits, + PSS padding, and SHA-512.\n\nSee \"`RSA_PSS_*` Details\" in + `ring::signature`'s module-level documentation for more details." +); + +pub use super::PublicKeyComponents as RsaPublicKeyComponents; + +impl super::PublicKeyComponents +where + B: AsRef<[u8]>, +{ + /// Verifies that `signature` is a valid signature of `message` using `self` + /// as the public key. `params` determine what algorithm parameters + /// (padding, digest algorithm, key length range, etc.) are used in the + /// verification. + /// + /// When the public key is in DER-encoded PKCS#1 ASN.1 format, it is + /// recommended to use `ring::signature::verify()` with + /// `ring::signature::RSA_PKCS1_*`, because `ring::signature::verify()` + /// will handle the parsing in that case. Otherwise, this function can be used + /// to pass in the raw bytes for the public key components as + /// `untrusted::Input` arguments. + // + // There are a small number of tests that test this directly, but the + // test coverage for this function mostly depends on the test coverage for the + // `signature::VerificationAlgorithm` implementation for `RsaParameters`. If we + // change that, test coverage for `verify_rsa()` will need to be reconsidered. + // (The NIST test vectors were originally in a form that was optimized for + // testing `verify_rsa` directly, but the testing work for RSA PKCS#1 + // verification was done during the implementation of + // `signature::VerificationAlgorithm`, before `verify_rsa` was factored out). 
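+    //
+    // A minimal usage sketch (`n_bytes`, `e_bytes`, `msg`, and `sig` stand in
+    // for caller-supplied byte slices):
+    //
+    //     let key = RsaPublicKeyComponents { n: n_bytes, e: e_bytes };
+    //     key.verify(&RSA_PKCS1_2048_8192_SHA256, msg, sig)?;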
+ pub fn verify( + &self, + params: &RsaParameters, + message: &[u8], + signature: &[u8], + ) -> Result<(), error::Unspecified> { + verify_rsa_( + params, + ( + untrusted::Input::from(self.n.as_ref()), + untrusted::Input::from(self.e.as_ref()), + ), + untrusted::Input::from(message), + untrusted::Input::from(signature), + cpu::features(), + ) + } +} + +pub(crate) fn verify_rsa_( + params: &RsaParameters, + (n, e): (untrusted::Input, untrusted::Input), + msg: untrusted::Input, + signature: untrusted::Input, + cpu_features: cpu::Features, +) -> Result<(), error::Unspecified> { + let max_bits: bits::BitLength = + bits::BitLength::from_byte_len(PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN) + .map_err(error::erase::)?; + + // XXX: FIPS 186-4 seems to indicate that the minimum + // exponent value is 2**16 + 1, but it isn't clear if this is just for + // signing or also for verification. We support exponents of 3 and larger + // for compatibility with other commonly-used crypto libraries. + let key = public_key::Inner::from_modulus_and_exponent( + n, + e, + params.min_bits, + max_bits, + PublicExponent::_3, + cpu_features, + )?; + + // RFC 8017 Section 5.2.2: RSAVP1. + let mut decoded = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; + let decoded = key.exponentiate(signature, &mut decoded, cpu_features)?; + + // Verify the padded message is correct. + let m_hash = digest::digest(params.padding_alg.digest_alg(), msg.as_slice_less_safe()); + untrusted::Input::from(decoded).read_all(error::Unspecified, |m| { + params.padding_alg.verify(m_hash, m, key.n().len_bits()) + }) +} diff --git a/ring-0.17.14/src/signature.rs b/ring-0.17.14/src/signature.rs new file mode 100644 index 0000000000..70806cac2e --- /dev/null +++ b/ring-0.17.14/src/signature.rs @@ -0,0 +1,409 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Public key signatures: signing and verification. +//! +//! Use the `verify` function to verify signatures, passing a reference to the +//! algorithm that identifies the algorithm. See the documentation for `verify` +//! for examples. +//! +//! For signature verification, this API treats each combination of parameters +//! as a separate algorithm. For example, instead of having a single "RSA" +//! algorithm with a verification function that takes a bunch of parameters, +//! there are `RSA_PKCS1_2048_8192_SHA256`, `RSA_PKCS1_2048_8192_SHA384`, etc., +//! which encode sets of parameter choices into objects. This is designed to +//! reduce the risks of algorithm agility and to provide consistency with ECDSA +//! and EdDSA. +//! +//! Currently this module does not support digesting the message to be signed +//! separately from the public key operation, as it is currently being +//! optimized for Ed25519 and for the implementation of protocols that do not +//! 
requiring signing large messages. An interface for efficiently supporting +//! larger messages may be added later. +//! +//! +//! # Algorithm Details +//! +//! ## `ECDSA_*_ASN1` Details: ASN.1-encoded ECDSA Signatures +//! +//! The signature is a ASN.1 DER-encoded `Ecdsa-Sig-Value` as described in +//! [RFC 3279 Section 2.2.3]. This is the form of ECDSA signature used in +//! X.509-related structures and in TLS's `ServerKeyExchange` messages. +//! +//! The public key is encoding in uncompressed form using the +//! Octet-String-to-Elliptic-Curve-Point algorithm in +//! [SEC 1: Elliptic Curve Cryptography, Version 2.0]. +//! +//! During verification, the public key is validated using the ECC Partial +//! Public-Key Validation Routine from Section 5.6.2.3.3 of +//! [NIST Special Publication 800-56A, revision 2] and Appendix A.3 of the +//! NSA's [Suite B implementer's guide to FIPS 186-3]. Note that, as explained +//! in the NSA guide, ECC Partial Public-Key Validation is equivalent to ECC +//! Full Public-Key Validation for prime-order curves like this one. +//! +//! ## `ECDSA_*_FIXED` Details: Fixed-length (PKCS#11-style) ECDSA Signatures +//! +//! The signature is *r*||*s*, where || denotes concatenation, and where both +//! *r* and *s* are both big-endian-encoded values that are left-padded to the +//! maximum length. A P-256 signature will be 64 bytes long (two 32-byte +//! components) and a P-384 signature will be 96 bytes long (two 48-byte +//! components). This is the form of ECDSA signature used PKCS#11 and DNSSEC. +//! +//! The public key is encoding in uncompressed form using the +//! Octet-String-to-Elliptic-Curve-Point algorithm in +//! [SEC 1: Elliptic Curve Cryptography, Version 2.0]. +//! +//! During verification, the public key is validated using the ECC Partial +//! Public-Key Validation Routine from Section 5.6.2.3.3 of +//! [NIST Special Publication 800-56A, revision 2] and Appendix A.3 of the +//! NSA's [Suite B implementer's guide to FIPS 186-3]. Note that, as explained +//! in the NSA guide, ECC Partial Public-Key Validation is equivalent to ECC +//! Full Public-Key Validation for prime-order curves like this one. +//! +//! ## `RSA_PKCS1_*` Details: RSA PKCS#1 1.5 Signatures +//! +//! The signature is an RSASSA-PKCS1-v1_5 signature as described in +//! [RFC 3447 Section 8.2]. +//! +//! The public key is encoded as an ASN.1 `RSAPublicKey` as described in +//! [RFC 3447 Appendix-A.1.1]. The public key modulus length, rounded *up* to +//! the nearest (larger) multiple of 8 bits, must be in the range given in the +//! name of the algorithm. The public exponent must be an odd integer of 2-33 +//! bits, inclusive. +//! +//! +//! ## `RSA_PSS_*` Details: RSA PSS Signatures +//! +//! The signature is an RSASSA-PSS signature as described in +//! [RFC 3447 Section 8.1]. +//! +//! The public key is encoded as an ASN.1 `RSAPublicKey` as described in +//! [RFC 3447 Appendix-A.1.1]. The public key modulus length, rounded *up* to +//! the nearest (larger) multiple of 8 bits, must be in the range given in the +//! name of the algorithm. The public exponent must be an odd integer of 2-33 +//! bits, inclusive. +//! +//! During verification, signatures will only be accepted if the MGF1 digest +//! algorithm is the same as the message digest algorithm and if the salt +//! length is the same length as the message digest. This matches the +//! requirements in TLS 1.3 and other recent specifications. +//! +//! During signing, the message digest algorithm will be used as the MGF1 +//! 
digest algorithm. The salt will be the same length as the message digest. +//! This matches the requirements in TLS 1.3 and other recent specifications. +//! Additionally, the entire salt is randomly generated separately for each +//! signature using the secure random number generator passed to `sign()`. +//! +//! +//! [SEC 1: Elliptic Curve Cryptography, Version 2.0]: +//! http://www.secg.org/sec1-v2.pdf +//! [NIST Special Publication 800-56A, revision 2]: +//! http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf +//! [Suite B implementer's guide to FIPS 186-3]: +//! https://github.com/briansmith/ring/blob/main/doc/ecdsa.pdf +//! [RFC 3279 Section 2.2.3]: +//! https://tools.ietf.org/html/rfc3279#section-2.2.3 +//! [RFC 3447 Section 8.2]: +//! https://tools.ietf.org/html/rfc3447#section-7.2 +//! [RFC 3447 Section 8.1]: +//! https://tools.ietf.org/html/rfc3447#section-8.1 +//! [RFC 3447 Appendix-A.1.1]: +//! https://tools.ietf.org/html/rfc3447#appendix-A.1.1 +//! +//! +//! # Examples +//! +//! ## Signing and verifying with Ed25519 +//! +//! ``` +//! use ring::{ +//! rand, +//! signature::{self, KeyPair}, +//! }; +//! +//! # fn main() -> Result<(), ring::error::Unspecified> { +//! // Generate a key pair in PKCS#8 (v2) format. +//! let rng = rand::SystemRandom::new(); +//! let pkcs8_bytes = signature::Ed25519KeyPair::generate_pkcs8(&rng)?; +//! +//! // Normally the application would store the PKCS#8 file persistently. Later +//! // it would read the PKCS#8 file from persistent storage to use it. +//! +//! let key_pair = signature::Ed25519KeyPair::from_pkcs8(pkcs8_bytes.as_ref())?; +//! +//! // Sign the message "hello, world". +//! const MESSAGE: &[u8] = b"hello, world"; +//! let sig = key_pair.sign(MESSAGE); +//! +//! // Normally an application would extract the bytes of the signature and +//! // send them in a protocol message to the peer(s). Here we just get the +//! // public key key directly from the key pair. +//! let peer_public_key_bytes = key_pair.public_key().as_ref(); +//! +//! // Verify the signature of the message using the public key. Normally the +//! // verifier of the message would parse the inputs to this code out of the +//! // protocol message(s) sent by the signer. +//! let peer_public_key = +//! signature::UnparsedPublicKey::new(&signature::ED25519, peer_public_key_bytes); +//! peer_public_key.verify(MESSAGE, sig.as_ref())?; +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! ## Signing and verifying with RSA (PKCS#1 1.5 padding) +//! +//! By default OpenSSL writes RSA public keys in SubjectPublicKeyInfo format, +//! not RSAPublicKey format, and Base64-encodes them (“PEM” format). +//! +//! To convert the PEM SubjectPublicKeyInfo format (“BEGIN PUBLIC KEY”) to the +//! binary RSAPublicKey format needed by `verify()`, use: +//! +//! ```sh +//! openssl rsa -pubin \ +//! -in public_key.pem \ +//! -inform PEM \ +//! -RSAPublicKey_out \ +//! -outform DER \ +//! -out public_key.der +//! ``` +//! +//! To extract the RSAPublicKey-formatted public key from an ASN.1 (binary) +//! DER-encoded RSAPrivateKey format private key file, use: +//! +//! ```sh +//! openssl rsa -in private_key.der \ +//! -inform DER \ +//! -RSAPublicKey_out \ +//! -outform DER \ +//! -out public_key.der +//! ``` +//! +//! ``` +//! # #[cfg(feature = "std")] +//! use ring::{rand, rsa, signature}; +//! +//! # #[cfg(feature = "std")] +//! fn sign_and_verify_rsa(private_key_path: &std::path::Path, +//! public_key_path: &std::path::Path) +//! -> Result<(), MyError> { +//! 
// Create an RSA keypair from the DER-encoded bytes. This example uses +//! // a 2048-bit key, but larger keys are also supported. +//! let private_key_der = read_file(private_key_path)?; +//! let key_pair = rsa::KeyPair::from_der(&private_key_der) +//! .map_err(|_| MyError::BadPrivateKey)?; +//! +//! // Sign the message "hello, world", using PKCS#1 v1.5 padding and the +//! // SHA256 digest algorithm. +//! const MESSAGE: &'static [u8] = b"hello, world"; +//! let rng = rand::SystemRandom::new(); +//! let mut signature = vec![0; key_pair.public().modulus_len()]; +//! key_pair.sign(&signature::RSA_PKCS1_SHA256, &rng, MESSAGE, &mut signature) +//! .map_err(|_| MyError::OOM)?; +//! +//! // Verify the signature. +//! let public_key = +//! signature::UnparsedPublicKey::new(&signature::RSA_PKCS1_2048_8192_SHA256, +//! read_file(public_key_path)?); +//! public_key.verify(MESSAGE, &signature) +//! .map_err(|_| MyError::BadSignature) +//! } +//! +//! #[derive(Debug)] +//! enum MyError { +//! # #[cfg(feature = "std")] +//! IO(std::io::Error), +//! BadPrivateKey, +//! OOM, +//! BadSignature, +//! } +//! +//! # #[cfg(feature = "std")] +//! fn read_file(path: &std::path::Path) -> Result, MyError> { +//! use std::io::Read; +//! +//! let mut file = std::fs::File::open(path).map_err(|e| MyError::IO(e))?; +//! let mut contents: Vec = Vec::new(); +//! file.read_to_end(&mut contents).map_err(|e| MyError::IO(e))?; +//! Ok(contents) +//! } +//! # +//! # #[cfg(not(feature = "std"))] +//! # fn sign_and_verify_rsa(_private_key_path: &std::path::Path, +//! # _public_key_path: &std::path::Path) +//! # -> Result<(), ()> { +//! # Ok(()) +//! # } +//! # +//! # fn main() { +//! # let private_key_path = +//! # std::path::Path::new("src/rsa/signature_rsa_example_private_key.der"); +//! # let public_key_path = +//! # std::path::Path::new("src/rsa/signature_rsa_example_public_key.der"); +//! # sign_and_verify_rsa(&private_key_path, &public_key_path).unwrap() +//! # } +//! ``` + +use crate::{cpu, debug, ec, error, sealed}; + +pub use crate::ec::{ + curve25519::ed25519::{ + signing::Ed25519KeyPair, + verification::{EdDSAParameters, ED25519}, + ED25519_PUBLIC_KEY_LEN, + }, + suite_b::ecdsa::{ + signing::{ + EcdsaKeyPair, EcdsaSigningAlgorithm, ECDSA_P256_SHA256_ASN1_SIGNING, + ECDSA_P256_SHA256_FIXED_SIGNING, ECDSA_P384_SHA384_ASN1_SIGNING, + ECDSA_P384_SHA384_FIXED_SIGNING, + }, + verification::{ + EcdsaVerificationAlgorithm, ECDSA_P256_SHA256_ASN1, ECDSA_P256_SHA256_FIXED, + ECDSA_P256_SHA384_ASN1, ECDSA_P384_SHA256_ASN1, ECDSA_P384_SHA384_ASN1, + ECDSA_P384_SHA384_FIXED, + }, + }, +}; + +#[cfg(feature = "alloc")] +pub use crate::rsa::{ + padding::{ + RsaEncoding, RSA_PKCS1_SHA256, RSA_PKCS1_SHA384, RSA_PKCS1_SHA512, RSA_PSS_SHA256, + RSA_PSS_SHA384, RSA_PSS_SHA512, + }, + verification::{ + RsaPublicKeyComponents, RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, + RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, + RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, + RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, RSA_PKCS1_2048_8192_SHA256, + RSA_PKCS1_2048_8192_SHA384, RSA_PKCS1_2048_8192_SHA512, RSA_PKCS1_3072_8192_SHA384, + RSA_PSS_2048_8192_SHA256, RSA_PSS_2048_8192_SHA384, RSA_PSS_2048_8192_SHA512, + }, + RsaParameters, +}; + +/// An RSA key pair, used for signing. +#[cfg(feature = "alloc")] +pub type RsaKeyPair = crate::rsa::KeyPair; + +/// A public key signature returned from a signing operation. 
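+///
+/// The signature bytes are available via `AsRef<[u8]>` (e.g. `sig.as_ref()`),
+/// as in the Ed25519 example above.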
+#[derive(Clone, Copy)] +pub struct Signature { + value: [u8; MAX_LEN], + len: usize, +} + +impl Signature { + // Panics if `value` is too long. + pub(crate) fn new(fill: F) -> Self + where + F: FnOnce(&mut [u8; MAX_LEN]) -> usize, + { + let mut r = Self { + value: [0; MAX_LEN], + len: 0, + }; + r.len = fill(&mut r.value); + r + } +} + +impl AsRef<[u8]> for Signature { + fn as_ref(&self) -> &[u8] { + &self.value[..self.len] + } +} + +/// Key pairs for signing messages (private key and public key). +pub trait KeyPair: core::fmt::Debug + Send + Sized + Sync { + /// The type of the public key. + type PublicKey: AsRef<[u8]> + core::fmt::Debug + Clone + Send + Sized + Sync; + + /// The public key for the key pair. + fn public_key(&self) -> &Self::PublicKey; +} + +/// The longest signature is an ASN.1 P-384 signature where *r* and *s* are of +/// maximum length with the leading high bit set on each. Then each component +/// will have a tag, a one-byte length, and a one-byte “I'm not negative” +/// prefix, and the outer sequence will have a two-byte length. +pub(crate) const MAX_LEN: usize = 1/*tag:SEQUENCE*/ + 2/*len*/ + + (2 * (1/*tag:INTEGER*/ + 1/*len*/ + 1/*zero*/ + ec::SCALAR_MAX_BYTES)); + +/// A signature verification algorithm. +pub trait VerificationAlgorithm: core::fmt::Debug + Sync + sealed::Sealed { + /// Verify the signature `signature` of message `msg` with the public key + /// `public_key`. + fn verify( + &self, + public_key: untrusted::Input, + msg: untrusted::Input, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified>; +} + +/// An unparsed, possibly malformed, public key for signature verification. +#[derive(Clone, Copy)] +pub struct UnparsedPublicKey { + algorithm: &'static dyn VerificationAlgorithm, + bytes: B, +} + +impl AsRef<[u8]> for UnparsedPublicKey +where + B: AsRef<[u8]>, +{ + fn as_ref(&self) -> &[u8] { + self.bytes.as_ref() + } +} + +impl core::fmt::Debug for UnparsedPublicKey +where + B: AsRef<[u8]>, +{ + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("UnparsedPublicKey") + .field("algorithm", &self.algorithm) + .field("bytes", &debug::HexStr(self.bytes.as_ref())) + .finish() + } +} + +impl UnparsedPublicKey { + /// Construct a new `UnparsedPublicKey`. + /// + /// No validation of `bytes` is done until `verify()` is called. + #[inline] + pub fn new(algorithm: &'static dyn VerificationAlgorithm, bytes: B) -> Self { + Self { algorithm, bytes } + } + + /// Parses the public key and verifies `signature` is a valid signature of + /// `message` using it. + /// + /// See the [crate::signature] module-level documentation for examples. + pub fn verify(&self, message: &[u8], signature: &[u8]) -> Result<(), error::Unspecified> + where + B: AsRef<[u8]>, + { + let _ = cpu::features(); + self.algorithm.verify( + untrusted::Input::from(self.bytes.as_ref()), + untrusted::Input::from(message), + untrusted::Input::from(signature), + ) + } +} diff --git a/ring-0.17.14/src/tests/bits_tests.rs b/ring-0.17.14/src/tests/bits_tests.rs new file mode 100644 index 0000000000..f8252026c9 --- /dev/null +++ b/ring-0.17.14/src/tests/bits_tests.rs @@ -0,0 +1,64 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + bits::{BitLength, FromByteLen as _}, + polyfill::u64_from_usize, +}; + +#[test] +fn test_from_byte_len_overflow() { + const USIZE_MAX_VALID_BYTES: usize = usize::MAX / 8; + + // Maximum valid input for BitLength. + match BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES) { + Ok(bits) => { + assert_eq!(bits.as_usize_bytes_rounded_up(), USIZE_MAX_VALID_BYTES); + assert_eq!(bits.as_bits(), usize::MAX & !0b111); + } + Err(_) => unreachable!(), + } + + // Minimum invalid usize input for BitLength. + assert!(BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES + 1).is_err()); + + // Minimum invalid usize input for BitLength on 64-bit targets. + { + let r = BitLength::::from_byte_len(USIZE_MAX_VALID_BYTES + 1); + if cfg!(target_pointer_width = "64") { + assert!(r.is_err()); + } else { + match r { + Ok(bits) => { + assert_eq!( + bits.as_bits(), + (u64_from_usize(USIZE_MAX_VALID_BYTES) + 1) * 8 + ); + } + Err(_) => unreachable!(), + } + } + } + + const U64_MAX_VALID_BYTES: u64 = u64::MAX / 8; + + // Maximum valid u64 input for BitLength. + match BitLength::::from_byte_len(U64_MAX_VALID_BYTES) { + Ok(bits) => assert_eq!(bits.as_bits(), u64::MAX & !0b111), + Err(_) => unreachable!(), + }; + + // Minimum invalid usize input for BitLength on 64-bit targets. + assert!(BitLength::::from_byte_len(U64_MAX_VALID_BYTES + 1).is_err()); +} diff --git a/ring-0.17.14/src/tests/mod.rs b/ring-0.17.14/src/tests/mod.rs new file mode 100644 index 0000000000..b1cabd60c6 --- /dev/null +++ b/ring-0.17.14/src/tests/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Integration tests for non-public APIs. + +mod bits_tests; diff --git a/ring-0.17.14/src/testutil.rs b/ring-0.17.14/src/testutil.rs new file mode 100644 index 0000000000..31cfb98f9a --- /dev/null +++ b/ring-0.17.14/src/testutil.rs @@ -0,0 +1,640 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Testing framework. +//! +//! Unlike the rest of *ring*, this testing framework uses panics pretty +//! liberally. It was originally designed for internal use--it drives most of +//! *ring*'s internal tests, and so it is optimized for getting *ring*'s tests +//! written quickly at the expense of some usability. The documentation is +//! lacking. The best way to learn it is to look at some examples. The digest +//! tests are the most complicated because they use named sections. Other tests +//! avoid named sections and so are easier to understand. +//! +//! # Examples +//! +//! ## Writing Tests +//! +//! Input files look like this: +//! +//! ```text +//! # This is a comment. +//! +//! HMAC = SHA1 +//! Input = "My test data" +//! Key = "" +//! Output = 61afdecb95429ef494d61fdee15990cabf0826fc +//! +//! HMAC = SHA256 +//! Input = "Sample message for keylen +//! at C:\Users\Example\example\:4 +//! 9: 0x7ff65496d49c - example_test +//! at C:\Users\Example\example\src\example.rs:652 +//! 10: 0x7ff6549d192a - test::stats::Summary::new::ha139494ed2e4e01f +//! 11: 0x7ff6549d51a2 - test::stats::Summary::new::ha139494ed2e4e01f +//! 12: 0x7ff654a0a911 - _rust_maybe_catch_panic +//! 13: 0x7ff6549d56dd - test::stats::Summary::new::ha139494ed2e4e01f +//! 14: 0x7ff654a03783 - std::sys::thread::Thread::new::h2b08da6cd2517f79 +//! 15: 0x7ff968518101 - BaseThreadInitThunk +//! ``` +//! +//! Notice that the output shows the name of the data file +//! (`src/example_tests.txt`), the test inputs that led to the failure, and the +//! stack trace to the line in the test code that panicked: entry 9 in the +//! stack trace pointing to line 652 of the file `example.rs`. + +extern crate alloc; + +use alloc::{format, string::String, vec::Vec}; + +use crate::{bits, digest, error}; + +#[cfg(any(feature = "std", feature = "test_logging"))] +extern crate std; + +/// `compile_time_assert_clone::();` fails to compile if `T` doesn't +/// implement `Clone`. +pub const fn compile_time_assert_clone() {} + +/// `compile_time_assert_copy::();` fails to compile if `T` doesn't +/// implement `Copy`. +pub const fn compile_time_assert_copy() {} + +/// `compile_time_assert_eq::();` fails to compile if `T` doesn't +/// implement `Eq`. +pub const fn compile_time_assert_eq() {} + +/// `compile_time_assert_send::();` fails to compile if `T` doesn't +/// implement `Send`. +pub const fn compile_time_assert_send() {} + +/// `compile_time_assert_sync::();` fails to compile if `T` doesn't +/// implement `Sync`. +pub const fn compile_time_assert_sync() {} + +/// `compile_time_assert_std_error_error::();` fails to compile if `T` +/// doesn't implement `std::error::Error`. +#[cfg(feature = "std")] +pub const fn compile_time_assert_std_error_error() {} + +/// A test case. A test case consists of a set of named attributes. Every +/// attribute in the test case must be consumed exactly once; this helps catch +/// typos and omissions. +/// +/// Requires the `alloc` default feature to be enabled. +#[derive(Debug)] +pub struct TestCase { + attributes: Vec<(String, String, bool)>, +} + +impl TestCase { + /// Maps the string "true" to true and the string "false" to false. 
+    pub fn consume_bool(&mut self, key: &str) -> bool {
+        match self.consume_string(key).as_ref() {
+            "true" => true,
+            "false" => false,
+            s => panic!("Invalid bool value: {}", s),
+        }
+    }
+
+    /// Maps the strings "SHA1", "SHA256", "SHA384", and "SHA512" to digest
+    /// algorithms, maps "SHA224" to `None`, and panics on other (erroneous)
+    /// inputs. "SHA224" is mapped to None because *ring* intentionally does
+    /// not support SHA224, but we need to consume test vectors from NIST that
+    /// have SHA224 vectors in them.
+    pub fn consume_digest_alg(&mut self, key: &str) -> Option<&'static digest::Algorithm> {
+        let name = self.consume_string(key);
+        match name.as_ref() {
+            "SHA1" => Some(&digest::SHA1_FOR_LEGACY_USE_ONLY),
+            "SHA224" => None, // We actively skip SHA-224 support.
+            "SHA256" => Some(&digest::SHA256),
+            "SHA384" => Some(&digest::SHA384),
+            "SHA512" => Some(&digest::SHA512),
+            "SHA512_256" => Some(&digest::SHA512_256),
+            _ => panic!("Unsupported digest algorithm: {}", name),
+        }
+    }
+
+    /// Returns the value of an attribute that is encoded as a sequence of an
+    /// even number of hex digits, or as a double-quoted UTF-8 string. The
+    /// empty (zero-length) value is represented as "".
+    pub fn consume_bytes(&mut self, key: &str) -> Vec<u8> {
+        self.consume_optional_bytes(key)
+            .unwrap_or_else(|| panic!("No attribute named \"{}\"", key))
+    }
+
+    /// Like `consume_bytes()` except it returns `None` if the test case
+    /// doesn't have the attribute.
+    pub fn consume_optional_bytes(&mut self, key: &str) -> Option<Vec<u8>> {
+        let s = self.consume_optional_string(key)?;
+        let result = if let [b'\"', s @ ..] = s.as_bytes() {
+            // The value is a quoted UTF-8 string.
+            let mut s = s.iter();
+            let mut bytes = Vec::with_capacity(s.len() - 1);
+            loop {
+                let b = match s.next() {
+                    Some(b'\\') => {
+                        match s.next() {
+                            // We don't allow all octal escape sequences, only "\0" for null.
+                            Some(b'0') => 0u8,
+                            Some(b't') => b'\t',
+                            Some(b'n') => b'\n',
+                            // "\xHH"
+                            Some(b'x') => {
+                                let hi = s.next().expect("Invalid hex escape sequence in string.");
+                                let lo = s.next().expect("Invalid hex escape sequence in string.");
+                                if let (Ok(hi), Ok(lo)) = (from_hex_digit(*hi), from_hex_digit(*lo))
+                                {
+                                    (hi << 4) | lo
+                                } else {
+                                    panic!("Invalid hex escape sequence in string.");
+                                }
+                            }
+                            _ => {
+                                panic!("Invalid hex escape sequence in string.");
+                            }
+                        }
+                    }
+                    Some(b'"') => {
+                        if s.next().is_some() {
+                            panic!("characters after the closing quote of a quoted string.");
+                        }
+                        break;
+                    }
+                    Some(b) => *b,
+                    None => panic!("Missing terminating '\"' in string literal."),
+                };
+                bytes.push(b);
+            }
+            bytes
+        } else {
+            // The value is hex encoded.
+            match from_hex(&s) {
+                Ok(s) => s,
+                Err(err_str) => {
+                    panic!("{} in {}", err_str, s);
+                }
+            }
+        };
+        Some(result)
+    }
+
+    /// Returns the value of an attribute that is an integer, in decimal
+    /// notation.
+    pub fn consume_usize(&mut self, key: &str) -> usize {
+        let s = self.consume_string(key);
+        s.parse::<usize>().unwrap()
+    }
+
+    /// Returns the value of an attribute that is an integer, in decimal
+    /// notation, as a bit length.
+    pub fn consume_usize_bits(&mut self, key: &str) -> bits::BitLength {
+        let s = self.consume_string(key);
+        let bits = s.parse::<usize>().unwrap();
+        bits::BitLength::from_bits(bits)
+    }
+
+    /// Returns the raw value of an attribute, without any unquoting or
+    /// other interpretation.
+    pub fn consume_string(&mut self, key: &str) -> String {
+        self.consume_optional_string(key)
+            .unwrap_or_else(|| panic!("No attribute named \"{}\"", key))
+    }
+
+    /// Like `consume_string()` except it returns `None` if the test case
+    /// doesn't have the attribute.
+    pub fn consume_optional_string(&mut self, key: &str) -> Option<String> {
+        for (name, value, consumed) in &mut self.attributes {
+            if key == name {
+                if *consumed {
+                    panic!("Attribute {} was already consumed", key);
+                }
+                *consumed = true;
+                return Some(value.clone());
+            }
+        }
+        None
+    }
+}
+
+/// References a test input file.
+#[cfg(test)]
+macro_rules! test_vector_file {
+    ($file_name:expr) => {
+        $crate::testutil::File {
+            file_name: $file_name,
+            contents: include_str!($file_name),
+        }
+    };
+}
+
+/// A test input file.
+pub struct File<'a> {
+    /// The name (path) of the file.
+    pub file_name: &'a str,
+
+    /// The contents of the file.
+    pub contents: &'a str,
+}
+
+/// Parses test cases out of the given file, calling `f` on each vector until
+/// `f` fails or until all the test vectors have been read. `f` can indicate
+/// failure either by returning `Err()` or by panicking.
+pub fn run<F>(test_file: File, mut f: F)
+where
+    F: FnMut(&str, &mut TestCase) -> Result<(), error::Unspecified>,
+{
+    let lines = &mut test_file.contents.lines();
+
+    let mut current_section = String::from("");
+    let mut failed = false;
+
+    while let Some(mut test_case) = parse_test_case(&mut current_section, lines) {
+        let result = match f(&current_section, &mut test_case) {
+            Ok(()) => {
+                if !test_case
+                    .attributes
+                    .iter()
+                    .any(|&(_, _, consumed)| !consumed)
+                {
+                    Ok(())
+                } else {
+                    failed = true;
+                    Err("Test didn't consume all attributes.")
+                }
+            }
+            Err(error::Unspecified) => Err("Test returned Err(error::Unspecified)."),
+        };
+
+        if result.is_err() {
+            failed = true;
+        }
+
+        #[cfg(feature = "test_logging")]
+        if let Err(msg) = result {
+            std::println!("{}: {}", test_file.file_name, msg);
+
+            for (name, value, consumed) in test_case.attributes {
+                let consumed_str = if consumed { "" } else { " (unconsumed)" };
+                std::println!("{}{} = {}", name, consumed_str, value);
+            }
+        };
+    }
+
+    if failed {
+        panic!("Test failed.")
+    }
+}
+
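For orientation, here is a minimal sketch (editorial, not part of the vendored file) of a data-driven test built on `run` and the `test_vector_file!` macro. The vector file name `my_alg_tests.txt`, its `Input`/`Output` attributes, and the `my_alg` function are all hypothetical stand-ins.

```rust
// Stand-in for the real function under test.
fn my_alg(input: &[u8]) -> Vec<u8> {
    input.to_vec()
}

#[test]
fn my_alg_known_answer() {
    use crate::testutil as test;

    // `test_vector_file!` bundles the vector file's path and contents.
    test::run(test_vector_file!("my_alg_tests.txt"), |section, test_case| {
        assert_eq!(section, "");
        let input = test_case.consume_bytes("Input");
        let expected = test_case.consume_bytes("Output");
        // Every attribute must be consumed, or `run` reports the test as failed.
        assert_eq!(expected, my_alg(&input));
        Ok(())
    });
}
```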
+/// Decode a string of hex digits into a sequence of bytes. The input must
+/// have an even number of digits.
+pub fn from_hex(hex_str: &str) -> Result<Vec<u8>, String> {
+    if hex_str.len() % 2 != 0 {
+        return Err(String::from(
+            "Hex string does not have an even number of digits",
+        ));
+    }
+
+    let mut result = Vec::with_capacity(hex_str.len() / 2);
+    for digits in hex_str.as_bytes().chunks(2) {
+        let hi = from_hex_digit(digits[0])?;
+        let lo = from_hex_digit(digits[1])?;
+        result.push((hi * 0x10) | lo);
+    }
+    Ok(result)
+}
+
+fn from_hex_digit(d: u8) -> Result<u8, String> {
+    use core::ops::RangeInclusive;
+    const DECIMAL: (u8, RangeInclusive<u8>) = (0, b'0'..=b'9');
+    const HEX_LOWER: (u8, RangeInclusive<u8>) = (10, b'a'..=b'f');
+    const HEX_UPPER: (u8, RangeInclusive<u8>) = (10, b'A'..=b'F');
+    for (offset, range) in &[DECIMAL, HEX_LOWER, HEX_UPPER] {
+        if range.contains(&d) {
+            return Ok(d - range.start() + offset);
+        }
+    }
+    Err(format!("Invalid hex digit '{}'", d as char))
+}
+
+fn parse_test_case(
+    current_section: &mut String,
+    lines: &mut dyn Iterator<Item = &str>,
+) -> Option<TestCase> {
+    let mut attributes = Vec::new();
+
+    let mut is_first_line = true;
+    loop {
+        let line = lines.next();
+
+        #[cfg(feature = "test_logging")]
+        if let Some(text) = &line {
+            std::println!("Line: {}", text);
+        }
+
+        match line {
+            // If we get to EOF when we're not in the middle of a test case,
+            // then we're done.
+            None if is_first_line => {
+                return None;
+            }
+
+            // End of the file on a non-empty test case ends the test case.
+            None => {
+                return Some(TestCase { attributes });
+            }
+
+            // A blank line ends a test case if the test case isn't empty.
+            Some("") => {
+                if !is_first_line {
+                    return Some(TestCase { attributes });
+                }
+                // Ignore leading blank lines.
+            }
+
+            // Comments start with '#'; ignore them.
+            Some(line) if line.starts_with('#') => (),
+
+            Some(line) if line.starts_with('[') => {
+                assert!(is_first_line);
+                assert!(line.ends_with(']'));
+                current_section.truncate(0);
+                current_section.push_str(line);
+                let _ = current_section.pop();
+                let _ = current_section.remove(0);
+            }
+
+            Some(line) => {
+                is_first_line = false;
+
+                let parts: Vec<&str> = line.splitn(2, " = ").collect();
+                if parts.len() != 2 {
+                    panic!("Syntax error: Expected Key = Value.");
+                };
+
+                let key = parts[0].trim();
+                let value = parts[1].trim();
+
+                // Don't allow the value to be omitted. An empty value can be
+                // represented as an empty quoted string.
+                assert_ne!(value.len(), 0);
+
+                // Checking is_none() ensures we don't accept duplicate keys.
+                attributes.push((String::from(key), String::from(value), false));
+            }
+        }
+    }
+}
+
+/// Deterministic implementations of `ring::rand::SecureRandom`.
+///
+/// These implementations are particularly useful for testing implementations
+/// of randomized algorithms & protocols using known-answer-tests where the
+/// test vectors contain the random seed to use. They are also especially
+/// useful for some types of fuzzing.
+#[doc(hidden)]
+pub mod rand {
+    use crate::{error, rand};
+
+    /// An implementation of `SecureRandom` that always fills the output slice
+    /// with the given byte.
+    #[derive(Debug)]
+    pub struct FixedByteRandom {
+        pub byte: u8,
+    }
+
+    impl rand::sealed::SecureRandom for FixedByteRandom {
+        fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> {
+            dest.fill(self.byte);
+            Ok(())
+        }
+    }
+
+    /// An implementation of `SecureRandom` that always fills the output slice
+    /// with the slice in `bytes`. The length of the slice given to `slice`
+    /// must match exactly.
+ #[derive(Debug)] + pub struct FixedSliceRandom<'a> { + pub bytes: &'a [u8], + } + + impl rand::sealed::SecureRandom for FixedSliceRandom<'_> { + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + dest.copy_from_slice(self.bytes); + Ok(()) + } + } + + /// An implementation of `SecureRandom` where each slice in `bytes` is a + /// test vector for one call to `fill()`. *Not thread-safe.* + /// + /// The first slice in `bytes` is the output for the first call to + /// `fill()`, the second slice is the output for the second call to + /// `fill()`, etc. The output slice passed to `fill()` must have exactly + /// the length of the corresponding entry in `bytes`. `current` must be + /// initialized to zero. `fill()` must be called exactly once for each + /// entry in `bytes`. + #[derive(Debug)] + pub struct FixedSliceSequenceRandom<'a> { + /// The value. + pub bytes: &'a [&'a [u8]], + pub current: core::cell::UnsafeCell, + } + + impl rand::sealed::SecureRandom for FixedSliceSequenceRandom<'_> { + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + let current = unsafe { *self.current.get() }; + let bytes = self.bytes[current]; + dest.copy_from_slice(bytes); + // Remember that we returned this slice and prepare to return + // the next one, if any. + unsafe { *self.current.get() += 1 }; + Ok(()) + } + } + + impl Drop for FixedSliceSequenceRandom<'_> { + fn drop(&mut self) { + // Ensure that `fill()` was called exactly the right number of + // times. + assert_eq!(unsafe { *self.current.get() }, self.bytes.len()); + } + } +} + +#[cfg(test)] +mod tests { + use crate::error; + use crate::testutil as test; + + #[test] + fn one_ok() { + test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + Ok(()) + }); + } + + #[test] + #[should_panic(expected = "Test failed.")] + fn one_err() { + test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + Err(error::Unspecified) + }); + } + + #[test] + #[should_panic(expected = "Oh noes!")] + fn one_panics() { + test::run(test_vector_file!("test_1_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + panic!("Oh noes!"); + }); + } + + #[test] + #[should_panic(expected = "Test failed.")] + fn first_err() { + err_one(0) + } + + #[test] + #[should_panic(expected = "Test failed.")] + fn middle_err() { + err_one(1) + } + + #[test] + #[should_panic(expected = "Test failed.")] + fn last_err() { + err_one(2) + } + + fn err_one(test_to_fail: usize) { + let mut n = 0; + test::run(test_vector_file!("test_3_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + let result = if n != test_to_fail { + Ok(()) + } else { + Err(error::Unspecified) + }; + n += 1; + result + }); + } + + #[test] + #[should_panic(expected = "Oh Noes!")] + fn first_panic() { + panic_one(0) + } + + #[test] + #[should_panic(expected = "Oh Noes!")] + fn middle_panic() { + panic_one(1) + } + + #[test] + #[should_panic(expected = "Oh Noes!")] + fn last_panic() { + panic_one(2) + } + + fn panic_one(test_to_fail: usize) { + let mut n = 0; + test::run(test_vector_file!("test_3_tests.txt"), |_, test_case| { + let _ = test_case.consume_string("Key"); + if n == test_to_fail { + panic!("Oh Noes!"); + }; + n += 1; + Ok(()) + }); + } + + #[test] + #[should_panic(expected = "Syntax error: Expected Key = Value.")] + fn syntax_error() { + test::run( + test_vector_file!("test_1_syntax_error_tests.txt"), + |_, _| Ok(()), + ); + } +} 
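The deterministic `SecureRandom` implementations above are what the public test suites later in this diff use to feed known private keys into key generation. A minimal sketch of that pattern, mirroring the agreement tests below (editorial, not part of the vendored sources):

```rust
#[allow(deprecated)]
use ring::{agreement, error, test};

// Build an X25519 ephemeral key whose private scalar comes from a test
// vector rather than from the system RNG.
fn private_key_from_vector(
    private_scalar: &[u8],
) -> Result<agreement::EphemeralPrivateKey, error::Unspecified> {
    let rng = test::rand::FixedSliceRandom {
        bytes: private_scalar,
    };
    agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)
}
```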
diff --git a/ring-0.17.14/tests/aead_tests.rs b/ring-0.17.14/tests/aead_tests.rs new file mode 100644 index 0000000000..261822f5ad --- /dev/null +++ b/ring-0.17.14/tests/aead_tests.rs @@ -0,0 +1,653 @@ +// Copyright 2015-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +use core::ops::RangeFrom; +use ring::{aead, error}; +#[allow(deprecated)] +use ring::{test, test_file}; + +/// Generate the known answer test functions for the given algorithm and test +/// case input file, where each test is implemented by a test in `$test`. +/// +/// All of these tests can be run in parallel. +macro_rules! test_known_answer { + ( $alg:ident, $test_file:expr, [ $( $test:ident ),+, ] ) => { + $( + #[test] + fn $test() { + test_aead( + &aead::$alg, + super::super::$test, + test_file!($test_file)); + } + )+ + } +} + +/// Generate the tests for a given algorithm. +/// +/// All of these tests can be run in parallel. +macro_rules! test_aead { + { $( { $alg:ident, $test_file:expr } ),+, } => { + mod aead_test { // Make `cargo test aead` include these files. + $( + #[allow(non_snake_case)] + mod $alg { // Provide a separate namespace for each algorithm's test. + use super::super::*; + + #[cfg(all(target_arch = "wasm32", target_os = "unknown"))] + use wasm_bindgen_test::wasm_bindgen_test as test; + + test_known_answer!( + $alg, + $test_file, + [ + less_safe_key_open_in_place, + less_safe_key_open_within, + less_safe_key_seal_in_place_append_tag, + less_safe_key_seal_in_place_separate_tag, + opening_key_open_in_place, + opening_key_open_within, + sealing_key_seal_in_place_append_tag, + sealing_key_seal_in_place_separate_tag, + test_open_in_place_seperate_tag, + ]); + + #[test] + fn key_sizes() { + super::super::key_sizes(&aead::$alg); + } + } + )+ + } + } +} + +test_aead! 
{ + { AES_128_GCM, "aead_aes_128_gcm_tests.txt" }, + { AES_256_GCM, "aead_aes_256_gcm_tests.txt" }, + { CHACHA20_POLY1305, "aead_chacha20_poly1305_tests.txt" }, +} + +struct KnownAnswerTestCase<'a> { + key: &'a [u8], + nonce: [u8; aead::NONCE_LEN], + plaintext: &'a [u8], + aad: aead::Aad<&'a [u8]>, + ciphertext: &'a [u8], + tag: &'a [u8], +} + +fn test_aead( + aead_alg: &'static aead::Algorithm, + f: impl Fn(&'static aead::Algorithm, KnownAnswerTestCase) -> Result<(), error::Unspecified>, + test_file: test::File, +) { + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let key = test_case.consume_bytes("KEY"); + let nonce = test_case.consume_bytes("NONCE"); + let plaintext = test_case.consume_bytes("IN"); + let aad = test_case.consume_bytes("AD"); + let ct = test_case.consume_bytes("CT"); + let tag = test_case.consume_bytes("TAG"); + let error = test_case.consume_optional_string("FAILS"); + + match error.as_deref() { + Some("WRONG_NONCE_LENGTH") => { + assert!(matches!( + aead::Nonce::try_assume_unique_for_key(&nonce), + Err(error::Unspecified) + )); + return Ok(()); + } + Some(unexpected) => { + unreachable!("unexpected error in test data: {}", unexpected); + } + None => {} + }; + + let test_case = KnownAnswerTestCase { + key: &key, + nonce: nonce.as_slice().try_into().unwrap(), + plaintext: &plaintext, + aad: aead::Aad::from(&aad), + ciphertext: &ct, + tag: &tag, + }; + + f(aead_alg, test_case) + }) +} + +fn test_seal_append_tag( + tc: &KnownAnswerTestCase, + seal: Seal, +) -> Result<(), error::Unspecified> +where + Seal: FnOnce(aead::Nonce, &mut Vec) -> Result<(), error::Unspecified>, +{ + let mut in_out = Vec::from(tc.plaintext); + seal(aead::Nonce::assume_unique_for_key(tc.nonce), &mut in_out)?; + + let mut expected_ciphertext_and_tag = Vec::from(tc.ciphertext); + expected_ciphertext_and_tag.extend_from_slice(tc.tag); + + assert_eq!(in_out, expected_ciphertext_and_tag); + + Ok(()) +} + +fn test_seal_separate_tag( + tc: &KnownAnswerTestCase, + seal: Seal, +) -> Result<(), error::Unspecified> +where + Seal: Fn(aead::Nonce, &mut [u8]) -> Result, +{ + let mut in_out = Vec::from(tc.plaintext); + let actual_tag = seal(aead::Nonce::assume_unique_for_key(tc.nonce), &mut in_out)?; + assert_eq!(actual_tag.as_ref(), tc.tag); + assert_eq!(in_out, tc.ciphertext); + + Ok(()) +} + +fn test_open_in_place( + tc: &KnownAnswerTestCase<'_>, + open_in_place: OpenInPlace, +) -> Result<(), error::Unspecified> +where + OpenInPlace: + for<'a> FnOnce(aead::Nonce, &'a mut [u8]) -> Result<&'a mut [u8], error::Unspecified>, +{ + let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); + + let mut in_out = Vec::from(tc.ciphertext); + in_out.extend_from_slice(tc.tag); + + let actual_plaintext = open_in_place(nonce, &mut in_out)?; + + assert_eq!(actual_plaintext, tc.plaintext); + assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); + Ok(()) +} + +fn test_open_in_place_seperate_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + let key = make_less_safe_key(alg, tc.key); + + let mut in_out = Vec::from(tc.ciphertext); + let tag = tc.tag.try_into().unwrap(); + + // Test the simplest behavior. + { + let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); + let actual_plaintext = + key.open_in_place_separate_tag(nonce, tc.aad, tag, &mut in_out, 0..)?; + + assert_eq!(actual_plaintext, tc.plaintext); + assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); + } + + // Test that ciphertext range shifting works as expected. 
+ { + let range = in_out.len()..; + in_out.extend_from_slice(tc.ciphertext); + + let nonce = aead::Nonce::assume_unique_for_key(tc.nonce); + let actual_plaintext = + key.open_in_place_separate_tag(nonce, tc.aad, tag, &mut in_out, range)?; + + assert_eq!(actual_plaintext, tc.plaintext); + assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); + } + + Ok(()) +} + +fn test_open_within( + tc: &KnownAnswerTestCase<'_>, + open_within: OpenWithin, +) -> Result<(), error::Unspecified> +where + OpenWithin: for<'a> Fn( + aead::Nonce, + &'a mut [u8], + RangeFrom, + ) -> Result<&'a mut [u8], error::Unspecified>, +{ + // In release builds, test all prefix lengths from 0 to 4096 bytes. + // Debug builds are too slow for this, so for those builds, only + // test a smaller subset. + + // TLS record headers are 5 bytes long. + // TLS explicit nonces for AES-GCM are 8 bytes long. + static MINIMAL_IN_PREFIX_LENS: [usize; 36] = [ + // No input prefix to overwrite; i.e. the opening is exactly + // "in place." + 0, + 1, + 2, + // Proposed TLS 1.3 header (no explicit nonce). + 5, + 8, + // Probably the most common use of a non-zero `in_prefix_len` + // would be to write a decrypted TLS record over the top of the + // TLS header and nonce. + 5 /* record header */ + 8, /* explicit nonce */ + // The stitched AES-GCM x86-64 code works on 6-block (96 byte) + // units. Some of the ChaCha20 code is even weirder. + 15, // The maximum partial AES block. + 16, // One AES block. + 17, // One byte more than a full AES block. + 31, // 2 AES blocks or 1 ChaCha20 block, minus 1. + 32, // Two AES blocks, one ChaCha20 block. + 33, // 2 AES blocks or 1 ChaCha20 block, plus 1. + 47, // Three AES blocks - 1. + 48, // Three AES blocks. + 49, // Three AES blocks + 1. + 63, // Four AES blocks or two ChaCha20 blocks, minus 1. + 64, // Four AES blocks or two ChaCha20 blocks. + 65, // Four AES blocks or two ChaCha20 blocks, plus 1. + 79, // Five AES blocks, minus 1. + 80, // Five AES blocks. + 81, // Five AES blocks, plus 1. + 95, // Six AES blocks or three ChaCha20 blocks, minus 1. + 96, // Six AES blocks or three ChaCha20 blocks. + 97, // Six AES blocks or three ChaCha20 blocks, plus 1. + 111, // Seven AES blocks, minus 1. + 112, // Seven AES blocks. + 113, // Seven AES blocks, plus 1. + 127, // Eight AES blocks or four ChaCha20 blocks, minus 1. + 128, // Eight AES blocks or four ChaCha20 blocks. + 129, // Eight AES blocks or four ChaCha20 blocks, plus 1. + 143, // Nine AES blocks, minus 1. + 144, // Nine AES blocks. + 145, // Nine AES blocks, plus 1. + 255, // 16 AES blocks or 8 ChaCha20 blocks, minus 1. + 256, // 16 AES blocks or 8 ChaCha20 blocks. + 257, // 16 AES blocks or 8 ChaCha20 blocks, plus 1. + ]; + + let mut more_comprehensive_in_prefix_lengths = [0; 4096]; + let in_prefix_lengths = if cfg!(debug_assertions) { + &MINIMAL_IN_PREFIX_LENS[..] + } else { + #[allow(clippy::needless_range_loop)] + for b in 0..more_comprehensive_in_prefix_lengths.len() { + more_comprehensive_in_prefix_lengths[b] = b; + } + &more_comprehensive_in_prefix_lengths[..] 
+ }; + let mut in_out = vec![123u8; 4096]; + + for &in_prefix_len in in_prefix_lengths.iter() { + in_out.truncate(0); + in_out.resize(in_prefix_len, 123); + in_out.extend_from_slice(tc.ciphertext); + in_out.extend_from_slice(tc.tag); + + let actual_plaintext = open_within( + aead::Nonce::assume_unique_for_key(tc.nonce), + &mut in_out, + in_prefix_len.., + )?; + assert_eq!(actual_plaintext, tc.plaintext); + assert_eq!(&in_out[..tc.plaintext.len()], tc.plaintext); + } + + Ok(()) +} + +fn sealing_key_seal_in_place_append_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_seal_append_tag(&tc, |nonce, in_out| { + let mut key: aead::SealingKey = make_key(alg, tc.key, nonce); + key.seal_in_place_append_tag(tc.aad, in_out) + }) +} + +fn sealing_key_seal_in_place_separate_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_seal_separate_tag(&tc, |nonce, in_out| { + let mut key: aead::SealingKey<_> = make_key(alg, tc.key, nonce); + key.seal_in_place_separate_tag(tc.aad, in_out) + }) +} + +fn opening_key_open_in_place( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_open_in_place(&tc, |nonce, in_out| { + let mut key: aead::OpeningKey<_> = make_key(alg, tc.key, nonce); + key.open_in_place(tc.aad, in_out) + }) +} + +fn opening_key_open_within( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_open_within(&tc, |nonce, in_out, ciphertext_and_tag| { + let mut key: aead::OpeningKey = make_key(alg, tc.key, nonce); + key.open_within(tc.aad, in_out, ciphertext_and_tag) + }) +} + +fn less_safe_key_seal_in_place_append_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_seal_append_tag(&tc, |nonce, in_out| { + let key = make_less_safe_key(alg, tc.key); + key.seal_in_place_append_tag(nonce, tc.aad, in_out) + }) +} + +fn less_safe_key_open_in_place( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_open_in_place(&tc, |nonce, in_out| { + let key = make_less_safe_key(alg, tc.key); + key.open_in_place(nonce, tc.aad, in_out) + }) +} + +fn less_safe_key_seal_in_place_separate_tag( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_seal_separate_tag(&tc, |nonce, in_out| { + let key = make_less_safe_key(alg, tc.key); + key.seal_in_place_separate_tag(nonce, tc.aad, in_out) + }) +} + +fn less_safe_key_open_within( + alg: &'static aead::Algorithm, + tc: KnownAnswerTestCase, +) -> Result<(), error::Unspecified> { + test_open_within(&tc, |nonce, in_out, ciphertext_and_tag| { + let key = make_less_safe_key(alg, tc.key); + key.open_within(nonce, tc.aad, in_out, ciphertext_and_tag) + }) +} + +#[allow(clippy::range_plus_one)] +fn key_sizes(aead_alg: &'static aead::Algorithm) { + let key_len = aead_alg.key_len(); + let key_data = vec![0u8; key_len * 2]; + + // Key is the right size. + assert!(aead::UnboundKey::new(aead_alg, &key_data[..key_len]).is_ok()); + + // Key is one byte too small. + assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len - 1)]).is_err()); + + // Key is one byte too large. + assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len + 1)]).is_err()); + + // Key is half the required size. 
+ assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len / 2)]).is_err()); + + // Key is twice the required size. + assert!(aead::UnboundKey::new(aead_alg, &key_data[..(key_len * 2)]).is_err()); + + // Key is empty. + assert!(aead::UnboundKey::new(aead_alg, &[]).is_err()); + + // Key is one byte. + assert!(aead::UnboundKey::new(aead_alg, &[0]).is_err()); +} + +// Test that we reject non-standard nonce sizes. +#[allow(clippy::range_plus_one)] +#[test] +fn test_aead_nonce_sizes() { + let nonce_len = aead::NONCE_LEN; + let nonce = vec![0u8; nonce_len * 2]; + + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..nonce_len]).is_ok()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len - 1)]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len + 1)]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len / 2)]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..(nonce_len * 2)]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&[]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..1]).is_err()); + assert!(aead::Nonce::try_assume_unique_for_key(&nonce[..16]).is_err()); // 128 bits. +} + +#[allow(clippy::range_plus_one)] +#[test] +fn aead_chacha20_poly1305_openssh() { + // TODO: test_aead_key_sizes(...); + + test::run( + test_file!("aead_chacha20_poly1305_openssh_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + // XXX: `polyfill::convert` isn't available here. + let key_bytes = { + let as_vec = test_case.consume_bytes("KEY"); + let mut as_array = [0u8; aead::chacha20_poly1305_openssh::KEY_LEN]; + as_array.copy_from_slice(&as_vec); + as_array + }; + + let sequence_num: u32 = test_case + .consume_usize("SEQUENCE_NUMBER") + .try_into() + .unwrap(); + let plaintext = test_case.consume_bytes("IN"); + let ct = test_case.consume_bytes("CT"); + let expected_tag = test_case.consume_bytes("TAG"); + + // TODO: Add some tests for when things fail. 
+ //let error = test_case.consume_optional_string("FAILS"); + + let mut tag = [0u8; aead::chacha20_poly1305_openssh::TAG_LEN]; + let mut s_in_out = plaintext.clone(); + let s_key = aead::chacha20_poly1305_openssh::SealingKey::new(&key_bytes); + s_key.seal_in_place(sequence_num, &mut s_in_out[..], &mut tag); + assert_eq!(&ct, &s_in_out); + assert_eq!(&expected_tag, &tag); + let o_key = aead::chacha20_poly1305_openssh::OpeningKey::new(&key_bytes); + + { + let o_result = o_key.open_in_place(sequence_num, &mut s_in_out[..], &tag); + assert_eq!(o_result, Ok(&plaintext[4..])); + } + assert_eq!(&s_in_out[..4], &ct[..4]); + assert_eq!(&s_in_out[4..], &plaintext[4..]); + + Ok(()) + }, + ); +} + +#[test] +fn aead_test_aad_traits() { + test::compile_time_assert_send::>(); + test::compile_time_assert_sync::>(); + test::compile_time_assert_copy::>(); + test::compile_time_assert_eq::>>(); // `!Copy` + + let aad_123 = aead::Aad::from(vec![1, 2, 3]); // `!Copy` + assert_eq!(aad_123, aad_123.clone()); // Cover `Clone` and `PartialEq` + assert_eq!( + format!("{:?}", aead::Aad::from(&[1, 2, 3])), + "Aad([1, 2, 3])" + ); +} + +#[test] +fn test_nonce_traits() { + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); +} + +#[test] +fn test_tag_traits() { + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); + + test::compile_time_assert_copy::(); + test::compile_time_assert_clone::(); + + let tag = aead::Tag::from([4u8; 16]); + let _tag_2 = tag; // Cover `Copy` + assert_eq!(tag.as_ref(), tag.clone().as_ref()); // Cover `Clone` +} + +fn test_aead_key_traits() {} + +#[test] +fn test_aead_key_traits_all() { + test_aead_key_traits::>(); + test_aead_key_traits::>(); + test_aead_key_traits::(); +} + +#[test] +fn test_aead_key_debug() { + let key_bytes = [0; 32]; + let nonce = [0; aead::NONCE_LEN]; + + let key = aead::UnboundKey::new(&aead::AES_256_GCM, &key_bytes).unwrap(); + assert_eq!( + "UnboundKey { algorithm: AES_256_GCM }", + format!("{:?}", key) + ); + + let sealing_key: aead::SealingKey = make_key( + &aead::AES_256_GCM, + &key_bytes, + aead::Nonce::try_assume_unique_for_key(&nonce).unwrap(), + ); + assert_eq!( + "SealingKey { algorithm: AES_256_GCM }", + format!("{:?}", sealing_key) + ); + + let opening_key: aead::OpeningKey = make_key( + &aead::AES_256_GCM, + &key_bytes, + aead::Nonce::try_assume_unique_for_key(&nonce).unwrap(), + ); + assert_eq!( + "OpeningKey { algorithm: AES_256_GCM }", + format!("{:?}", opening_key) + ); + + let key: aead::LessSafeKey = make_less_safe_key(&aead::AES_256_GCM, &key_bytes); + assert_eq!( + "LessSafeKey { algorithm: AES_256_GCM }", + format!("{:?}", key) + ); +} + +fn test_aead_lesssafekey_clone_for_algorithm(algorithm: &'static aead::Algorithm) { + let test_bytes: Vec = (0..32).collect(); + let key_bytes = &test_bytes[..algorithm.key_len()]; + let nonce_bytes = &test_bytes[..algorithm.nonce_len()]; + + let key1: aead::LessSafeKey = + aead::LessSafeKey::new(aead::UnboundKey::new(algorithm, key_bytes).unwrap()); + let key2 = key1.clone(); + + // LessSafeKey doesn't support AsRef or PartialEq, so instead just check that both keys produce + // the same encrypted output. 
+ let mut buf1: Vec = (0..100).collect(); + let mut buf2 = buf1.clone(); + let tag1 = key1 + .seal_in_place_separate_tag( + aead::Nonce::try_assume_unique_for_key(nonce_bytes).unwrap(), + aead::Aad::empty(), + &mut buf1, + ) + .unwrap(); + let tag2 = key2 + .seal_in_place_separate_tag( + aead::Nonce::try_assume_unique_for_key(nonce_bytes).unwrap(), + aead::Aad::empty(), + &mut buf2, + ) + .unwrap(); + assert_eq!(tag1.as_ref(), tag2.as_ref()); + assert_eq!(buf1, buf2); +} + +#[test] +fn test_aead_lesssafekey_clone_aes_128_gcm() { + test_aead_lesssafekey_clone_for_algorithm(&aead::AES_128_GCM); +} + +#[test] +fn test_aead_lesssafekey_clone_aes_256_gcm() { + test_aead_lesssafekey_clone_for_algorithm(&aead::AES_256_GCM); +} + +#[test] +fn test_aead_lesssafekey_clone_chacha20_poly1305() { + test_aead_lesssafekey_clone_for_algorithm(&aead::CHACHA20_POLY1305); +} + +fn make_key>( + algorithm: &'static aead::Algorithm, + key: &[u8], + nonce: aead::Nonce, +) -> K { + let key = aead::UnboundKey::new(algorithm, key).unwrap(); + let nonce_sequence = OneNonceSequence::new(nonce); + K::new(key, nonce_sequence) +} + +fn make_less_safe_key(algorithm: &'static aead::Algorithm, key: &[u8]) -> aead::LessSafeKey { + let key = aead::UnboundKey::new(algorithm, key).unwrap(); + aead::LessSafeKey::new(key) +} + +struct OneNonceSequence(Option); + +impl OneNonceSequence { + /// Constructs the sequence allowing `advance()` to be called + /// `allowed_invocations` times. + fn new(nonce: aead::Nonce) -> Self { + Self(Some(nonce)) + } +} + +impl aead::NonceSequence for OneNonceSequence { + fn advance(&mut self) -> Result { + self.0.take().ok_or(error::Unspecified) + } +} diff --git a/ring-0.17.14/tests/agreement_tests.rs b/ring-0.17.14/tests/agreement_tests.rs new file mode 100644 index 0000000000..edbe04fcb5 --- /dev/null +++ b/ring-0.17.14/tests/agreement_tests.rs @@ -0,0 +1,218 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
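The AEAD tests above all reduce to the same seal/open round trip. A minimal sketch of that flow with `LessSafeKey` (editorial, not part of the vendored tests; key and nonce handling is simplified, and a real caller must never reuse a nonce with the same key):

```rust
use ring::aead::{self, Aad, LessSafeKey, Nonce, UnboundKey};
use ring::error;

// Seal `msg` with AES-256-GCM, then open it again and return the recovered
// plaintext.
fn seal_then_open(
    key_bytes: &[u8; 32],
    nonce_bytes: [u8; aead::NONCE_LEN],
    msg: &[u8],
) -> Result<Vec<u8>, error::Unspecified> {
    let key = LessSafeKey::new(UnboundKey::new(&aead::AES_256_GCM, key_bytes)?);

    // Encrypt in place; the authentication tag is appended to `in_out`.
    let mut in_out = msg.to_vec();
    key.seal_in_place_append_tag(
        Nonce::assume_unique_for_key(nonce_bytes),
        Aad::empty(),
        &mut in_out,
    )?;

    // Decrypt in place; on success the returned slice is the plaintext.
    let plaintext = key.open_in_place(
        Nonce::assume_unique_for_key(nonce_bytes),
        Aad::empty(),
        &mut in_out,
    )?;
    Ok(plaintext.to_vec())
}
```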
+ +#![allow(missing_docs)] + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +extern crate alloc; + +use ring::{agreement, error, rand}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[test] +fn agreement_traits() { + use alloc::vec::Vec; + + let rng = rand::SystemRandom::new(); + let private_key = + agreement::EphemeralPrivateKey::generate(&agreement::ECDH_P256, &rng).unwrap(); + + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); + + assert_eq!( + format!("{:?}", &private_key), + "EphemeralPrivateKey { algorithm: Algorithm { curve: P256 } }" + ); + + let public_key = private_key.compute_public_key().unwrap(); + + test::compile_time_assert_clone::(); + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); + + // Verify `PublicKey` implements `Debug`. + // + // TODO: Test the actual output. + let _: &dyn core::fmt::Debug = &public_key; + + test::compile_time_assert_clone::>(); + test::compile_time_assert_copy::>(); + test::compile_time_assert_sync::>(); + + test::compile_time_assert_clone::>>(); + test::compile_time_assert_sync::>>(); + + let unparsed_public_key = + agreement::UnparsedPublicKey::new(&agreement::X25519, &[0x01, 0x02, 0x03]); + + assert_eq!( + format!("{:?}", unparsed_public_key), + r#"UnparsedPublicKey { algorithm: Algorithm { curve: Curve25519 }, bytes: "010203" }"# + ); + + // Test `AsRef<[u8]>` + assert_eq!(unparsed_public_key.as_ref(), &[0x01, 0x02, 0x03]); +} + +#[test] +fn agreement_agree_ephemeral() { + let rng = rand::SystemRandom::new(); + + test::run(test_file!("agreement_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let alg = alg_from_curve_name(&curve_name); + let peer_public = agreement::UnparsedPublicKey::new(alg, test_case.consume_bytes("PeerQ")); + + match test_case.consume_optional_string("Error") { + None => { + let my_private = test_case.consume_bytes("D"); + let my_private = { + #[allow(deprecated)] + let rng = test::rand::FixedSliceRandom { bytes: &my_private }; + agreement::EphemeralPrivateKey::generate(alg, &rng)? + }; + let my_public = test_case.consume_bytes("MyQ"); + let output = test_case.consume_bytes("Output"); + + assert_eq!(my_private.algorithm(), alg); + + let computed_public = my_private.compute_public_key().unwrap(); + assert_eq!(computed_public.as_ref(), &my_public[..]); + + assert_eq!(my_private.algorithm(), alg); + + let result = agreement::agree_ephemeral(my_private, &peer_public, |key_material| { + assert_eq!(key_material, &output[..]); + }); + assert_eq!(result, Ok(())); + } + + Some(_) => { + // In the no-heap mode, some algorithms aren't supported so + // we have to skip those algorithms' test cases. + let dummy_private_key = agreement::EphemeralPrivateKey::generate(alg, &rng)?; + fn kdf_not_called(_: &[u8]) -> Result<(), ()> { + panic!( + "The KDF was called during ECDH when the peer's \ + public key is invalid." 
+ ); + } + assert!(agreement::agree_ephemeral( + dummy_private_key, + &peer_public, + kdf_not_called + ) + .is_err()); + } + } + + Ok(()) + }); +} + +#[test] +fn test_agreement_ecdh_x25519_rfc_iterated() { + let mut k = h("0900000000000000000000000000000000000000000000000000000000000000"); + let mut u = k.clone(); + + fn expect_iterated_x25519( + expected_result: &str, + range: core::ops::Range, + k: &mut Vec, + u: &mut Vec, + ) { + for _ in range { + let new_k = x25519(k, u); + u.clone_from(k); + *k = new_k; + } + assert_eq!(&h(expected_result), k); + } + + expect_iterated_x25519( + "422c8e7a6227d7bca1350b3e2bb7279f7897b87bb6854b783c60e80311ae3079", + 0..1, + &mut k, + &mut u, + ); + expect_iterated_x25519( + "684cf59ba83309552800ef566f2f4d3c1c3887c49360e3875f2eb94d99532c51", + 1..1_000, + &mut k, + &mut u, + ); + + // The spec gives a test vector for 1,000,000 iterations but it takes + // too long to do 1,000,000 iterations by default right now. This + // 10,000 iteration vector is self-computed. + expect_iterated_x25519( + "2c125a20f639d504a7703d2e223c79a79de48c4ee8c23379aa19a62ecd211815", + 1_000..10_000, + &mut k, + &mut u, + ); + + if cfg!(feature = "slow_tests") { + expect_iterated_x25519( + "7c3911e0ab2586fd864497297e575e6f3bc601c0883c30df5f4dd2d24f665424", + 10_000..1_000_000, + &mut k, + &mut u, + ); + } +} + +fn x25519(private_key: &[u8], public_key: &[u8]) -> Vec { + x25519_(private_key, public_key).unwrap() +} + +fn x25519_(private_key: &[u8], public_key: &[u8]) -> Result, error::Unspecified> { + #[allow(deprecated)] + let rng = test::rand::FixedSliceRandom { bytes: private_key }; + let private_key = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?; + let public_key = agreement::UnparsedPublicKey::new(&agreement::X25519, public_key); + agreement::agree_ephemeral(private_key, &public_key, |agreed_value| { + Vec::from(agreed_value) + }) +} + +fn h(s: &str) -> Vec { + match test::from_hex(s) { + Ok(v) => v, + Err(msg) => { + panic!("{} in {}", msg, s); + } + } +} + +fn alg_from_curve_name(curve_name: &str) -> &'static agreement::Algorithm { + if curve_name == "P-256" { + &agreement::ECDH_P256 + } else if curve_name == "P-384" { + &agreement::ECDH_P384 + } else if curve_name == "X25519" { + &agreement::X25519 + } else { + panic!("Unsupported curve: {}", curve_name); + } +} diff --git a/ring-0.17.14/tests/constant_time_tests.rs b/ring-0.17.14/tests/constant_time_tests.rs new file mode 100644 index 0000000000..b8d9ac38ff --- /dev/null +++ b/ring-0.17.14/tests/constant_time_tests.rs @@ -0,0 +1,78 @@ +// Copyright 2020 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
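The agreement tests above exercise the same three-step flow a caller would use. A minimal sketch from one party's side (editorial; `peer_public_bytes` would arrive from the other party):

```rust
use ring::{agreement, error, rand};

// Derive a shared secret over X25519. In real use the key material should be
// fed into a KDF inside the closure rather than copied out.
fn derive_shared_secret(peer_public_bytes: &[u8]) -> Result<Vec<u8>, error::Unspecified> {
    let rng = rand::SystemRandom::new();
    let my_private = agreement::EphemeralPrivateKey::generate(&agreement::X25519, &rng)?;
    let _my_public = my_private.compute_public_key()?; // Send this to the peer.

    let peer_public = agreement::UnparsedPublicKey::new(&agreement::X25519, peer_public_bytes);
    agreement::agree_ephemeral(my_private, &peer_public, |key_material| {
        key_material.to_vec()
    })
}
```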
+ +#![allow(missing_docs)] + +#[allow(deprecated)] +use constant_time::verify_slices_are_equal; +#[allow(deprecated)] +use ring::constant_time; +use ring::{error, rand}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +// This logic is loosely based on BoringSSL's `TEST(ConstantTimeTest, MemCmp)`. +#[allow(deprecated)] +#[test] +fn test_verify_slices_are_equal() { + let initial: [u8; 256] = rand::generate(&rand::SystemRandom::new()).unwrap().expose(); + + { + let copy = initial; + for len in 0..copy.len() { + // Not equal because the lengths do not match. + assert_eq!( + verify_slices_are_equal(&initial, ©[..len]), + Err(error::Unspecified) + ); + // Equal lengths and equal contents. + assert_eq!( + verify_slices_are_equal(&initial[..len], ©[..len]), + Ok(()) + ); + } + // Equal lengths and equal contents. + assert_eq!(verify_slices_are_equal(&initial, ©), Ok(())); + } + + for i in 0..initial.len() { + for bit in 0..8 { + let mut copy = initial; + copy[i] ^= 1u8 << bit; + + for len in 0..=initial.len() { + // We flipped at least one bit in `copy`. + assert_ne!(&initial[..], ©[..]); + + let a = &initial[..len]; + let b = ©[..len]; + + let expected_result = if i < len { + // The flipped bit is within `b` so `a` and `b` are not equal. + Err(error::Unspecified) + } else { + // The flipped bit is outside of `b` so `a` and `b` are equal. + Ok(()) + }; + assert_eq!(a == b, expected_result.is_ok()); // Sanity check. + assert_eq!(verify_slices_are_equal(a, b), expected_result); + assert_eq!(verify_slices_are_equal(b, a), expected_result); + } + } + } +} diff --git a/ring-0.17.14/tests/digest_tests.rs b/ring-0.17.14/tests/digest_tests.rs new file mode 100644 index 0000000000..0a3a824826 --- /dev/null +++ b/ring-0.17.14/tests/digest_tests.rs @@ -0,0 +1,137 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::digest; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +/// Test vectors from BoringSSL, Go, and other sources. 
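The digest tests that follow check that the streaming (`Context`) and one-shot (`digest::digest`) interfaces agree. A minimal sketch of that equivalence (editorial, not part of the vendored tests):

```rust
use ring::digest;

// Hash the same data incrementally and in one shot; both paths must agree.
fn sha256_two_ways(chunks: &[&[u8]]) -> bool {
    let mut ctx = digest::Context::new(&digest::SHA256);
    let mut all = Vec::new();
    for chunk in chunks {
        ctx.update(chunk);
        all.extend_from_slice(chunk);
    }
    ctx.finish().as_ref() == digest::digest(&digest::SHA256, &all).as_ref()
}
```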
+#[test] +fn digest_misc() { + test::run(test_file!("digest_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let digest_alg = test_case.consume_digest_alg("Hash").unwrap(); + let input = test_case.consume_bytes("Input"); + let repeat = test_case.consume_usize("Repeat"); + let expected = test_case.consume_bytes("Output"); + + let mut ctx = digest::Context::new(digest_alg); + let mut data = Vec::new(); + for _ in 0..repeat { + ctx.update(&input); + data.extend(&input); + } + let actual_from_chunks = ctx.finish(); + assert_eq!(&expected, &actual_from_chunks.as_ref()); + + let actual_from_one_shot = digest::digest(digest_alg, &data); + assert_eq!(&expected, &actual_from_one_shot.as_ref()); + + Ok(()) + }); +} + +/// Test some ways in which `Context::update` and/or `Context::finish` +/// could go wrong by testing every combination of updating three inputs +/// that vary from zero bytes to one byte larger than the block length. +/// +/// These are not run in dev (debug) builds because they are too slow. +macro_rules! test_i_u_f { + ( $test_name:ident, $alg:expr) => { + #[cfg(not(debug_assertions))] + #[test] + fn $test_name() { + let mut input = [0; (digest::MAX_BLOCK_LEN + 1) * 3]; + let max = $alg.block_len() + 1; + for i in 0..(max * 3) { + input[i] = (i & 0xff) as u8; + } + + for i in 0..max { + for j in 0..max { + for k in 0..max { + let part1 = &input[..i]; + let part2 = &input[i..(i + j)]; + let part3 = &input[(i + j)..(i + j + k)]; + + let mut ctx = digest::Context::new(&$alg); + ctx.update(part1); + ctx.update(part2); + ctx.update(part3); + let i_u_f = ctx.finish(); + + let one_shot = digest::digest(&$alg, &input[..(i + j + k)]); + + assert_eq!(i_u_f.as_ref(), one_shot.as_ref()); + } + } + } + } + }; +} +test_i_u_f!(digest_test_i_u_f_sha1, digest::SHA1_FOR_LEGACY_USE_ONLY); +test_i_u_f!(digest_test_i_u_f_sha256, digest::SHA256); +test_i_u_f!(digest_test_i_u_f_sha384, digest::SHA384); +test_i_u_f!(digest_test_i_u_f_sha512, digest::SHA512); + +#[test] +fn test_fmt_algorithm() { + assert_eq!("SHA1", &format!("{:?}", digest::SHA1_FOR_LEGACY_USE_ONLY)); + assert_eq!("SHA256", &format!("{:?}", digest::SHA256)); + assert_eq!("SHA384", &format!("{:?}", digest::SHA384)); + assert_eq!("SHA512", &format!("{:?}", digest::SHA512)); + assert_eq!("SHA512_256", &format!("{:?}", digest::SHA512_256)); +} + +#[test] +fn digest_test_fmt() { + assert_eq!( + "SHA1:b7e23ec29af22b0b4e41da31e868d57226121c84", + &format!( + "{:?}", + digest::digest(&digest::SHA1_FOR_LEGACY_USE_ONLY, b"hello, world") + ) + ); + assert_eq!( + "SHA256:09ca7e4eaa6e8ae9c7d261167129184883644d\ + 07dfba7cbfbc4c8a2e08360d5b", + &format!("{:?}", digest::digest(&digest::SHA256, b"hello, world")) + ); + assert_eq!( + "SHA384:1fcdb6059ce05172a26bbe2a3ccc88ed5a8cd5\ + fc53edfd9053304d429296a6da23b1cd9e5c9ed3bb34f0\ + 0418a70cdb7e", + &format!("{:?}", digest::digest(&digest::SHA384, b"hello, world")) + ); + assert_eq!( + "SHA512:8710339dcb6814d0d9d2290ef422285c9322b7\ + 163951f9a0ca8f883d3305286f44139aa374848e4174f5\ + aada663027e4548637b6d19894aec4fb6c46a139fbf9", + &format!("{:?}", digest::digest(&digest::SHA512, b"hello, world")) + ); + + assert_eq!( + "SHA512_256:11f2c88c04f0a9c3d0970894ad2472505e\ + 0bc6e8c7ec46b5211cd1fa3e253e62", + &format!("{:?}", digest::digest(&digest::SHA512_256, b"hello, world")) + ); +} diff --git a/ring-0.17.14/tests/ecdsa_test_private_key_p256.p8 b/ring-0.17.14/tests/ecdsa_test_private_key_p256.p8 new file mode 100644 index 0000000000..bc118842bb Binary files /dev/null and 
b/ring-0.17.14/tests/ecdsa_test_private_key_p256.p8 differ diff --git a/ring-0.17.14/tests/ecdsa_test_public_key_p256.der b/ring-0.17.14/tests/ecdsa_test_public_key_p256.der new file mode 100644 index 0000000000..ce3e6d7e97 --- /dev/null +++ b/ring-0.17.14/tests/ecdsa_test_public_key_p256.der @@ -0,0 +1 @@ +f#eP墳M`*eғP3M?5Y$^[Zzܧk3Y5 . \ No newline at end of file diff --git a/ring-0.17.14/tests/ecdsa_test_public_key_p256_debug.txt b/ring-0.17.14/tests/ecdsa_test_public_key_p256_debug.txt new file mode 100644 index 0000000000..b590fc8490 --- /dev/null +++ b/ring-0.17.14/tests/ecdsa_test_public_key_p256_debug.txt @@ -0,0 +1 @@ +PublicKey("04fc116698a3e3236550c4c9efa9bd4d0619602a65d2930e9150ab33e84dbc83f8a6a6b9933f35ab59245e5b5a7af5dca76b33cbe7aeee5981b3ca350bebf52ecd") \ No newline at end of file diff --git a/ring-0.17.14/tests/ecdsa_tests.rs b/ring-0.17.14/tests/ecdsa_tests.rs new file mode 100644 index 0000000000..83e5abf405 --- /dev/null +++ b/ring-0.17.14/tests/ecdsa_tests.rs @@ -0,0 +1,321 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::{ + rand, + signature::{self, KeyPair}, +}; +#[allow(deprecated)] +use ring::{test, test_file}; + +// ECDSA *signing* tests are in src/ec/ecdsa/signing.rs. 
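For context on what the tests below exercise, here is a minimal sketch of the ECDSA P-256 sign/verify round trip (editorial; `pkcs8_bytes` is assumed to hold a key in the format produced by `EcdsaKeyPair::generate_pkcs8`):

```rust
use ring::rand;
use ring::signature::{self, KeyPair};

// Load a P-256 key from PKCS#8, sign `msg`, and verify the signature with
// the corresponding public key.
fn ecdsa_sign_and_verify(pkcs8_bytes: &[u8], msg: &[u8]) -> Result<(), ring::error::Unspecified> {
    let rng = rand::SystemRandom::new();
    let key_pair = signature::EcdsaKeyPair::from_pkcs8(
        &signature::ECDSA_P256_SHA256_FIXED_SIGNING,
        pkcs8_bytes,
        &rng,
    )
    .map_err(|_| ring::error::Unspecified)?;

    let sig = key_pair.sign(&rng, msg)?;

    let public_key = signature::UnparsedPublicKey::new(
        &signature::ECDSA_P256_SHA256_FIXED,
        key_pair.public_key().as_ref(),
    );
    public_key.verify(msg, sig.as_ref())
}
```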
+ +#[test] +fn ecdsa_from_pkcs8_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_file!("ecdsa_from_pkcs8_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let ((this_fixed, this_asn1), (other_fixed, other_asn1)) = match curve_name.as_str() { + "P-256" => ( + ( + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + ), + ( + &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + ), + ), + "P-384" => ( + ( + &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + ), + ( + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + ), + ), + _ => unreachable!(), + }; + + let input = test_case.consume_bytes("Input"); + + let error = test_case.consume_optional_string("Error"); + + match ( + signature::EcdsaKeyPair::from_pkcs8(this_fixed, &input, &rng), + error.clone(), + ) { + (Ok(_), None) => (), + (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), + (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), + (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + }; + + match ( + signature::EcdsaKeyPair::from_pkcs8(this_asn1, &input, &rng), + error, + ) { + (Ok(_), None) => (), + (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), + (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), + (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + }; + + assert!(signature::EcdsaKeyPair::from_pkcs8(other_fixed, &input, &rng).is_err()); + assert!(signature::EcdsaKeyPair::from_pkcs8(other_asn1, &input, &rng).is_err()); + + Ok(()) + }, + ); +} + +// Verify that, at least, we generate PKCS#8 documents that we can read. 
+#[test] +fn ecdsa_generate_pkcs8_test() { + let rng = rand::SystemRandom::new(); + + for alg in &[ + &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + ] { + let pkcs8 = signature::EcdsaKeyPair::generate_pkcs8(alg, &rng).unwrap(); + println!(); + for b in pkcs8.as_ref() { + print!("{:02x}", *b); + } + println!(); + println!(); + + #[cfg(feature = "alloc")] + let _ = signature::EcdsaKeyPair::from_pkcs8(alg, pkcs8.as_ref(), &rng).unwrap(); + } +} + +#[test] +fn signature_ecdsa_verify_asn1_test() { + test::run( + test_file!("ecdsa_verify_asn1_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + let msg = test_case.consume_bytes("Msg"); + let public_key = test_case.consume_bytes("Q"); + let sig = test_case.consume_bytes("Sig"); + let is_valid = test_case.consume_string("Result") == "P (0 )"; + + let alg = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_ASN1, + ("P-256", "SHA384") => &signature::ECDSA_P256_SHA384_ASN1, + ("P-384", "SHA256") => &signature::ECDSA_P384_SHA256_ASN1, + ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_ASN1, + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let actual_result = + signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); + assert_eq!(actual_result.is_ok(), is_valid); + + Ok(()) + }, + ); +} + +#[test] +fn signature_ecdsa_verify_fixed_test() { + test::run( + test_file!("ecdsa_verify_fixed_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + + let msg = test_case.consume_bytes("Msg"); + let public_key = test_case.consume_bytes("Q"); + let sig = test_case.consume_bytes("Sig"); + let expected_result = test_case.consume_string("Result"); + + let alg = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_FIXED, + ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_FIXED, + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let is_valid = expected_result == "P (0 )"; + + let actual_result = + signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); + assert_eq!(actual_result.is_ok(), is_valid); + + Ok(()) + }, + ); +} + +#[test] +fn ecdsa_test_public_key_coverage() { + const PRIVATE_KEY: &[u8] = include_bytes!("ecdsa_test_private_key_p256.p8"); + const PUBLIC_KEY: &[u8] = include_bytes!("ecdsa_test_public_key_p256.der"); + const PUBLIC_KEY_DEBUG: &str = include_str!("ecdsa_test_public_key_p256_debug.txt"); + + let rng = rand::SystemRandom::new(); + let key_pair = signature::EcdsaKeyPair::from_pkcs8( + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + PRIVATE_KEY, + &rng, + ) + .unwrap(); + + // Test `AsRef<[u8]>` + assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); + + // Test `Clone`. + #[allow(clippy::clone_on_copy, clippy::redundant_clone)] + let _: ::PublicKey = key_pair.public_key().clone(); + + // Test `Copy`. + let _: ::PublicKey = *key_pair.public_key(); + + // Test `Debug`. 
+ assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); + assert_eq!( + format!("EcdsaKeyPair {{ public_key: {:?} }}", key_pair.public_key()), + format!("{:?}", key_pair) + ); +} + +// This test is not a known-answer test, though it re-uses the known-answer +// test vectors. Because the nonce is randomized, the signature will be +// different each time. Because of that, here we simply verify that the +// signature verifies correctly. The known-answer tests themselves are in +// ecsda/signing.rs. +#[test] +fn signature_ecdsa_sign_fixed_sign_and_verify_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_file!("../src/ec/suite_b/ecdsa/ecdsa_sign_fixed_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + + let msg = test_case.consume_bytes("Msg"); + let d = test_case.consume_bytes("d"); + let q = test_case.consume_bytes("Q"); + + // Ignored since the actual signature will use a randomized nonce. + let _k = test_case.consume_bytes("k"); + let _expected_result = test_case.consume_bytes("Sig"); + + let (signing_alg, verification_alg) = match (curve_name.as_str(), digest_name.as_str()) + { + ("P-256", "SHA256") => ( + &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + &signature::ECDSA_P256_SHA256_FIXED, + ), + ("P-384", "SHA384") => ( + &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + &signature::ECDSA_P384_SHA384_FIXED, + ), + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let private_key = + signature::EcdsaKeyPair::from_private_key_and_public_key(signing_alg, &d, &q, &rng) + .unwrap(); + + let signature = private_key.sign(&rng, &msg).unwrap(); + + let public_key = signature::UnparsedPublicKey::new(verification_alg, q); + assert_eq!(public_key.verify(&msg, signature.as_ref()), Ok(())); + + Ok(()) + }, + ); +} + +// This test is not a known-answer test, though it re-uses the known-answer +// test vectors. Because the nonce is randomized, the signature will be +// different each time. Because of that, here we simply verify that the +// signature verifies correctly. The known-answer tests themselves are in +// ecsda/signing.rs. +#[test] +fn signature_ecdsa_sign_asn1_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_file!("../src/ec/suite_b/ecdsa/ecdsa_sign_asn1_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + + let msg = test_case.consume_bytes("Msg"); + let d = test_case.consume_bytes("d"); + let q = test_case.consume_bytes("Q"); + + // Ignored since the actual signature will use a randomized nonce. 
+ let _k = test_case.consume_bytes("k"); + let _expected_result = test_case.consume_bytes("Sig"); + + let (signing_alg, verification_alg) = match (curve_name.as_str(), digest_name.as_str()) + { + ("P-256", "SHA256") => ( + &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + &signature::ECDSA_P256_SHA256_ASN1, + ), + ("P-384", "SHA384") => ( + &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + &signature::ECDSA_P384_SHA384_ASN1, + ), + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let private_key = + signature::EcdsaKeyPair::from_private_key_and_public_key(signing_alg, &d, &q, &rng) + .unwrap(); + + let signature = private_key.sign(&rng, &msg).unwrap(); + + let public_key = signature::UnparsedPublicKey::new(verification_alg, q); + assert_eq!(public_key.verify(&msg, signature.as_ref()), Ok(())); + + Ok(()) + }, + ); +} diff --git a/ring-0.17.14/tests/ed25519_test_private_key.bin b/ring-0.17.14/tests/ed25519_test_private_key.bin new file mode 100644 index 0000000000..afd7169069 --- /dev/null +++ b/ring-0.17.14/tests/ed25519_test_private_key.bin @@ -0,0 +1 @@ +aZ`J,DIi{2ip;` \ No newline at end of file diff --git a/ring-0.17.14/tests/ed25519_test_private_key.p8 b/ring-0.17.14/tests/ed25519_test_private_key.p8 new file mode 100644 index 0000000000..fc90176662 Binary files /dev/null and b/ring-0.17.14/tests/ed25519_test_private_key.p8 differ diff --git a/ring-0.17.14/tests/ed25519_test_public_key.bin b/ring-0.17.14/tests/ed25519_test_public_key.bin new file mode 100644 index 0000000000..9ed91630c4 --- /dev/null +++ b/ring-0.17.14/tests/ed25519_test_public_key.bin @@ -0,0 +1,2 @@ +Z +Kd:rڦ#%hQ \ No newline at end of file diff --git a/ring-0.17.14/tests/ed25519_test_public_key.der b/ring-0.17.14/tests/ed25519_test_public_key.der new file mode 100644 index 0000000000..0ac2bd5c41 --- /dev/null +++ b/ring-0.17.14/tests/ed25519_test_public_key.der @@ -0,0 +1 @@ +X X~WX5ök} \ No newline at end of file diff --git a/ring-0.17.14/tests/ed25519_tests.rs b/ring-0.17.14/tests/ed25519_tests.rs new file mode 100644 index 0000000000..65afc94897 --- /dev/null +++ b/ring-0.17.14/tests/ed25519_tests.rs @@ -0,0 +1,236 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::{ + error, rand, + signature::{self, Ed25519KeyPair, KeyPair}, +}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +/// Test vectors from BoringSSL. 
+#[test] +fn test_signature_ed25519() { + test::run(test_file!("ed25519_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let seed = test_case.consume_bytes("SEED"); + assert_eq!(32, seed.len()); + + let public_key = test_case.consume_bytes("PUB"); + assert_eq!(32, public_key.len()); + + let msg = test_case.consume_bytes("MESSAGE"); + + let expected_sig = test_case.consume_bytes("SIG"); + + { + let key_pair = Ed25519KeyPair::from_seed_and_public_key(&seed, &public_key).unwrap(); + let actual_sig = key_pair.sign(&msg); + assert_eq!(&expected_sig[..], actual_sig.as_ref()); + } + + // Test PKCS#8 generation, parsing, and private-to-public calculations. + #[allow(deprecated)] + let rng = test::rand::FixedSliceRandom { bytes: &seed }; + let pkcs8 = Ed25519KeyPair::generate_pkcs8(&rng).unwrap(); + let key_pair = Ed25519KeyPair::from_pkcs8(pkcs8.as_ref()).unwrap(); + assert_eq!(public_key, key_pair.public_key().as_ref()); + + // Test Signature generation. + let actual_sig = key_pair.sign(&msg); + assert_eq!(&expected_sig[..], actual_sig.as_ref()); + + // Test Signature verification. + test_signature_verification(&public_key, &msg, &expected_sig, Ok(())); + + let mut tampered_sig = expected_sig; + tampered_sig[0] ^= 1; + + test_signature_verification(&public_key, &msg, &tampered_sig, Err(error::Unspecified)); + + Ok(()) + }); +} + +/// Test vectors from BoringSSL. +#[test] +fn test_signature_ed25519_verify() { + test::run( + test_file!("ed25519_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let public_key = test_case.consume_bytes("PUB"); + let msg = test_case.consume_bytes("MESSAGE"); + let sig = test_case.consume_bytes("SIG"); + let expected_result = match test_case.consume_string("Result").as_str() { + "P" => Ok(()), + "F" => Err(error::Unspecified), + s => panic!("{:?} is not a valid result", s), + }; + test_signature_verification(&public_key, &msg, &sig, expected_result); + Ok(()) + }, + ); +} + +fn test_signature_verification( + public_key: &[u8], + msg: &[u8], + sig: &[u8], + expected_result: Result<(), error::Unspecified>, +) { + assert_eq!( + expected_result, + signature::UnparsedPublicKey::new(&signature::ED25519, public_key).verify(msg, sig) + ); +} + +#[test] +fn test_ed25519_from_seed_and_public_key_misuse() { + const PRIVATE_KEY: &[u8] = include_bytes!("ed25519_test_private_key.bin"); + const PUBLIC_KEY: &[u8] = include_bytes!("ed25519_test_public_key.bin"); + + assert!(Ed25519KeyPair::from_seed_and_public_key(PRIVATE_KEY, PUBLIC_KEY).is_ok()); + + // Truncated private key. + assert!(Ed25519KeyPair::from_seed_and_public_key(&PRIVATE_KEY[..31], PUBLIC_KEY).is_err()); + + // Truncated public key. + assert!(Ed25519KeyPair::from_seed_and_public_key(PRIVATE_KEY, &PUBLIC_KEY[..31]).is_err()); + + // Swapped public and private key. + assert!(Ed25519KeyPair::from_seed_and_public_key(PUBLIC_KEY, PRIVATE_KEY).is_err()); +} + +enum FromPkcs8Variant { + Checked, + MaybeUnchecked, +} + +#[test] +fn test_ed25519_from_pkcs8_unchecked() { + test_ed25519_from_pkcs8_( + FromPkcs8Variant::MaybeUnchecked, + Ed25519KeyPair::from_pkcs8_maybe_unchecked, + ) +} + +#[test] +fn test_ed25519_from_pkcs8() { + test_ed25519_from_pkcs8_(FromPkcs8Variant::Checked, Ed25519KeyPair::from_pkcs8) +} + +fn test_ed25519_from_pkcs8_( + variant: FromPkcs8Variant, + f: impl Fn(&[u8]) -> Result, +) { + // Just test that we can parse the input. 
+ test::run( + test_file!("ed25519_from_pkcs8_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + let input = test_case.consume_bytes("Input"); + let expected_error = { + let expected_checked = test_case.consume_string("Result-Checked"); + let expected_maybe_unchecked = test_case.consume_string("Result-Maybe-Unchecked"); + let expected_result = match variant { + FromPkcs8Variant::Checked => expected_checked, + FromPkcs8Variant::MaybeUnchecked => expected_maybe_unchecked, + }; + if expected_result == "OK" { + None + } else { + Some(expected_result) + } + }; + let expected_public = { + let expected_if_no_error = test_case.consume_optional_bytes("Public"); + if expected_error.is_none() { + Some(expected_if_no_error.unwrap()) + } else { + None + } + }; + + match f(&input) { + Ok(keypair) => { + assert_eq!(expected_error, None); + assert_eq!( + expected_public.as_deref(), + Some(keypair.public_key().as_ref()) + ); + } + Err(actual_error) => { + assert_eq!(expected_error, Some(format!("{}", actual_error))); + assert_eq!(expected_public, None); + } + } + + Ok(()) + }, + ); +} + +#[test] +fn ed25519_test_generate_pkcs8() { + let rng = rand::SystemRandom::new(); + let generated = Ed25519KeyPair::generate_pkcs8(&rng).unwrap(); + let generated = generated.as_ref(); + + let _ronudtripped = Ed25519KeyPair::from_pkcs8(generated).unwrap(); + + // Regression test: Verify we're generating the correct encoding, as + // `Ed25519KeyPair::from_pkcs8` also accepts our old wrong encoding. + assert_eq!(generated.len(), 19 + 32 + 32); + assert_eq!(&generated[..2], &[0x30, 0x51]); +} + +#[test] +fn ed25519_test_public_key_coverage() { + const PRIVATE_KEY: &[u8] = include_bytes!("ed25519_test_private_key.p8"); + const PUBLIC_KEY: &[u8] = include_bytes!("ed25519_test_public_key.der"); + const PUBLIC_KEY_DEBUG: &str = + "PublicKey(\"5809e9fef6dcec58f0f2e3b0d67e9880a11957e083ace85835c3b6c8fbaf6b7d\")"; + + let key_pair = Ed25519KeyPair::from_pkcs8(PRIVATE_KEY).unwrap(); + + // Test `AsRef<[u8]>` + assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); + + // Test `Clone`. + #[allow(clippy::clone_on_copy)] + let _: ::PublicKey = key_pair.public_key().clone(); + + // Test `Copy`. + let _: ::PublicKey = *key_pair.public_key(); + + // Test `Debug`. + assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); + assert_eq!( + format!( + "Ed25519KeyPair {{ public_key: {:?} }}", + key_pair.public_key() + ), + format!("{:?}", key_pair) + ); +} diff --git a/ring-0.17.14/tests/error_tests.rs b/ring-0.17.14/tests/error_tests.rs new file mode 100644 index 0000000000..0152b703e8 --- /dev/null +++ b/ring-0.17.14/tests/error_tests.rs @@ -0,0 +1,12 @@ +#![allow(missing_docs)] + +#[cfg(feature = "std")] +#[test] +fn error_impl_std_error_error_test() { + use ring::error; + #[allow(deprecated)] + use ring::test; + + test::compile_time_assert_std_error_error::(); + test::compile_time_assert_std_error_error::(); +} diff --git a/ring-0.17.14/tests/hkdf_tests.rs b/ring-0.17.14/tests/hkdf_tests.rs new file mode 100644 index 0000000000..64f664bc1d --- /dev/null +++ b/ring-0.17.14/tests/hkdf_tests.rs @@ -0,0 +1,131 @@ +// Copyright 2015 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::{digest, error, hkdf}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn hkdf_tests() { + test::run(test_file!("hkdf_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let alg = { + let digest_alg = test_case + .consume_digest_alg("Hash") + .ok_or(error::Unspecified)?; + if digest_alg == &digest::SHA256 { + hkdf::HKDF_SHA256 + } else { + // TODO: add test vectors for other algorithms + panic!("unsupported algorithm: {:?}", digest_alg); + } + }; + let secret = test_case.consume_bytes("IKM"); + let salt = test_case.consume_bytes("salt"); + let info = test_case.consume_bytes("info"); + let _ = test_case.consume_bytes("PRK"); + let expected_out = test_case.consume_bytes("OKM"); + + let salt = hkdf::Salt::new(alg, &salt); + + // TODO: test multi-part info, especially with empty parts. + let My(out) = salt + .extract(&secret) + .expand(&[&info], My(expected_out.len())) + .unwrap() + .into(); + assert_eq!(out, expected_out); + + Ok(()) + }); +} + +#[test] +fn hkdf_output_len_tests() { + for &alg in &[hkdf::HKDF_SHA256, hkdf::HKDF_SHA384, hkdf::HKDF_SHA512] { + const MAX_BLOCKS: usize = 255; + + let salt = hkdf::Salt::new(alg, &[]); + let prk = salt.extract(&[]); // TODO: enforce minimum length. + + { + // Test zero length. + let okm = prk.expand(&[b"info"], My(0)).unwrap(); + let result: My> = okm.into(); + assert_eq!(&result.0, &[]); + } + + let max_out_len = MAX_BLOCKS * alg.hmac_algorithm().digest_algorithm().output_len(); + + { + // Test maximum length output succeeds. + let okm = prk.expand(&[b"info"], My(max_out_len)).unwrap(); + let result: My> = okm.into(); + assert_eq!(result.0.len(), max_out_len); + } + + { + // Test too-large output fails. + assert!(prk.expand(&[b"info"], My(max_out_len + 1)).is_err()); + } + + { + // Test length mismatch (smaller). + let okm = prk.expand(&[b"info"], My(2)).unwrap(); + let mut buf = [0u8; 1]; + assert_eq!(okm.fill(&mut buf), Err(error::Unspecified)); + } + + { + // Test length mismatch (larger). + let okm = prk.expand(&[b"info"], My(2)).unwrap(); + let mut buf = [0u8; 3]; + assert_eq!(okm.fill(&mut buf), Err(error::Unspecified)); + } + + { + // Control for above two tests. + let okm = prk.expand(&[b"info"], My(2)).unwrap(); + let mut buf = [0u8; 2]; + assert_eq!(okm.fill(&mut buf), Ok(())); + } + } +} + +/// Generic newtype wrapper that lets us implement traits for externally-defined +/// types. 
+#[derive(Debug, PartialEq)] +struct My(T); + +impl hkdf::KeyType for My { + fn len(&self) -> usize { + self.0 + } +} + +impl From>> for My> { + fn from(okm: hkdf::Okm>) -> Self { + let mut r = vec![0u8; okm.len().0]; + okm.fill(&mut r).unwrap(); + Self(r) + } +} diff --git a/ring-0.17.14/tests/hmac_tests.rs b/ring-0.17.14/tests/hmac_tests.rs new file mode 100644 index 0000000000..4a56f30f85 --- /dev/null +++ b/ring-0.17.14/tests/hmac_tests.rs @@ -0,0 +1,113 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::{digest, hmac}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn hmac_tests() { + test::run(test_file!("hmac_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let digest_alg = test_case.consume_digest_alg("HMAC"); + let key_value = test_case.consume_bytes("Key"); + let mut input = test_case.consume_bytes("Input"); + let output = test_case.consume_bytes("Output"); + + let algorithm = { + let digest_alg = match digest_alg { + Some(digest_alg) => digest_alg, + None => { + return Ok(()); + } // Unsupported digest algorithm + }; + if digest_alg == &digest::SHA1_FOR_LEGACY_USE_ONLY { + hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY + } else if digest_alg == &digest::SHA256 { + hmac::HMAC_SHA256 + } else if digest_alg == &digest::SHA384 { + hmac::HMAC_SHA384 + } else if digest_alg == &digest::SHA512 { + hmac::HMAC_SHA512 + } else { + unreachable!() + } + }; + + hmac_test_case_inner(algorithm, &key_value[..], &input[..], &output[..], true); + + // Tamper with the input and check that verification fails. + if input.is_empty() { + input.push(0); + } else { + input[0] ^= 1; + } + + hmac_test_case_inner(algorithm, &key_value[..], &input[..], &output[..], false); + + Ok(()) + }); +} + +fn hmac_test_case_inner( + algorithm: hmac::Algorithm, + key_value: &[u8], + input: &[u8], + output: &[u8], + is_ok: bool, +) { + let key = hmac::Key::new(algorithm, key_value); + + // One-shot API. + { + let signature = hmac::sign(&key, input); + assert_eq!(is_ok, signature.as_ref() == output); + assert_eq!(is_ok, hmac::verify(&key, input, output).is_ok()); + } + + // Multi-part API, one single part. + { + let mut s_ctx = hmac::Context::with_key(&key); + s_ctx.update(input); + let signature = s_ctx.sign(); + assert_eq!(is_ok, signature.as_ref() == output); + } + + // Multi-part API, byte by byte. 
+ { + let mut ctx = hmac::Context::with_key(&key); + for b in input { + ctx.update(&[*b]); + } + let signature = ctx.sign(); + assert_eq!(is_ok, signature.as_ref() == output); + } +} + +#[test] +fn hmac_debug() { + let key = hmac::Key::new(hmac::HMAC_SHA256, &[0; 32]); + assert_eq!("Key { algorithm: SHA256 }", format!("{:?}", &key)); + + let ctx = hmac::Context::with_key(&key); + assert_eq!("Context { algorithm: SHA256 }", format!("{:?}", &ctx)); +} diff --git a/ring-0.17.14/tests/pbkdf2_tests.rs b/ring-0.17.14/tests/pbkdf2_tests.rs new file mode 100644 index 0000000000..a78506e60f --- /dev/null +++ b/ring-0.17.14/tests/pbkdf2_tests.rs @@ -0,0 +1,72 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use core::num::NonZeroU32; +use ring::{digest, error, pbkdf2}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +/// Test vectors from BoringSSL, Go, and other sources. +#[test] +pub fn pbkdf2_tests() { + test::run(test_file!("pbkdf2_tests.txt"), |section, test_case| { + assert_eq!(section, ""); + let algorithm = { + let digest_alg = test_case.consume_digest_alg("Hash").unwrap(); + if digest_alg == &digest::SHA1_FOR_LEGACY_USE_ONLY { + pbkdf2::PBKDF2_HMAC_SHA1 + } else if digest_alg == &digest::SHA256 { + pbkdf2::PBKDF2_HMAC_SHA256 + } else if digest_alg == &digest::SHA384 { + pbkdf2::PBKDF2_HMAC_SHA384 + } else if digest_alg == &digest::SHA512 { + pbkdf2::PBKDF2_HMAC_SHA512 + } else { + unreachable!() + } + }; + let iterations: u32 = test_case.consume_usize("c").try_into().unwrap(); + let iterations: NonZeroU32 = iterations.try_into().unwrap(); + let secret = test_case.consume_bytes("P"); + let salt = test_case.consume_bytes("S"); + let dk = test_case.consume_bytes("DK"); + let verify_expected_result = test_case.consume_string("Verify"); + let verify_expected_result = match verify_expected_result.as_str() { + "OK" => Ok(()), + "Err" => Err(error::Unspecified), + _ => panic!("Unsupported value of \"Verify\""), + }; + + { + let mut out = vec![0u8; dk.len()]; + pbkdf2::derive(algorithm, iterations, &salt, &secret, &mut out); + assert_eq!(dk == out, verify_expected_result.is_ok() || dk.is_empty()); + } + + assert_eq!( + pbkdf2::verify(algorithm, iterations, &salt, &secret, &dk), + verify_expected_result + ); + + Ok(()) + }); +} diff --git a/ring-0.17.14/tests/quic_tests.rs b/ring-0.17.14/tests/quic_tests.rs new file mode 100644 index 0000000000..dae30330a3 --- /dev/null +++ b/ring-0.17.14/tests/quic_tests.rs @@ -0,0 +1,87 @@ +// Copyright 2018 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::aead::quic; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[test] +fn quic_aes_128() { + test_quic(&quic::AES_128, test_file!("quic_aes_128_tests.txt")); +} + +#[test] +fn quic_aes_256() { + test_quic(&quic::AES_256, test_file!("quic_aes_256_tests.txt")); +} + +#[test] +fn quic_chacha20() { + test_quic(&quic::CHACHA20, test_file!("quic_chacha20_tests.txt")); +} + +fn test_quic(alg: &'static quic::Algorithm, test_file: test::File) { + test_key_len(alg); + test_sample_len(alg); + + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let key_bytes = test_case.consume_bytes("KEY"); + let sample = test_case.consume_bytes("SAMPLE"); + let mask = test_case.consume_bytes("MASK"); + + let key = quic::HeaderProtectionKey::new(alg, &key_bytes)?; + + assert_eq!(mask.as_ref(), key.new_mask(&sample)?); + + Ok(()) + }); +} + +#[allow(clippy::range_plus_one)] +fn test_key_len(alg: &'static quic::Algorithm) { + let key_len = alg.key_len(); + let key_data = vec![0u8; key_len + 1]; + + assert!(quic::HeaderProtectionKey::new(alg, &[]).is_err()); + assert!(quic::HeaderProtectionKey::new(alg, &key_data[..key_len]).is_ok()); + assert!(quic::HeaderProtectionKey::new(alg, &key_data[..(key_len + 1)]).is_err()); + assert!(quic::HeaderProtectionKey::new(alg, &key_data[..(key_len - 1)]).is_err()); +} + +#[allow(clippy::range_plus_one)] +fn test_sample_len(alg: &'static quic::Algorithm) { + let key_len = alg.key_len(); + let key_data = vec![0u8; key_len]; + + let key = quic::HeaderProtectionKey::new(alg, &key_data).unwrap(); + + let sample_len = alg.sample_len(); + assert_eq!(sample_len, 16); // For all currently-implemented algorithms + let sample_data = vec![0u8; sample_len + 2]; + + // Sample is the right size. + assert!(key.new_mask(&sample_data[..sample_len]).is_ok()); + + // Sample is one byte too small. + assert!(key.new_mask(&sample_data[..(sample_len - 1)]).is_err()); + + // Sample is one byte too big. + assert!(key.new_mask(&sample_data[..(sample_len + 1)]).is_err()); + + // Sample is empty. + assert!(key.new_mask(&[]).is_err()); +} diff --git a/ring-0.17.14/tests/rand_tests.rs b/ring-0.17.14/tests/rand_tests.rs new file mode 100644 index 0000000000..91965362d9 --- /dev/null +++ b/ring-0.17.14/tests/rand_tests.rs @@ -0,0 +1,79 @@ +// Copyright 2015-2019 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] + +use ring::rand::{self, SecureRandom as _}; +#[allow(deprecated)] +use ring::test; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn test_system_random_lengths() { + const LINUX_LIMIT: usize = 256; + const WEB_LIMIT: usize = 65536; + + // Test that `fill` succeeds for various interesting lengths. `256` and + // multiples thereof are interesting because that's an edge case for + // `getrandom` on Linux. + let lengths = [ + 0, + 1, + 2, + 3, + 96, + LINUX_LIMIT - 1, + LINUX_LIMIT, + LINUX_LIMIT + 1, + LINUX_LIMIT * 2, + 511, + 512, + 513, + 4096, + WEB_LIMIT - 1, + WEB_LIMIT, + WEB_LIMIT + 1, + WEB_LIMIT * 2, + ]; + + for len in lengths.iter() { + let mut buf = vec![0; *len]; + + let rng = rand::SystemRandom::new(); + assert!(rng.fill(&mut buf).is_ok()); + + // If `len` < 96 then there's a big chance of false positives, but + // otherwise the likelihood of a false positive is so too low to + // worry about. + if *len >= 96 { + assert!(buf.iter().any(|x| *x != 0)); + } + } +} + +#[test] +fn test_system_random_traits() { + test::compile_time_assert_clone::(); + test::compile_time_assert_send::(); + + assert_eq!( + "SystemRandom(())", + format!("{:?}", rand::SystemRandom::new()) + ); +} diff --git a/ring-0.17.14/tests/rsa_test_private_key_2048.p8 b/ring-0.17.14/tests/rsa_test_private_key_2048.p8 new file mode 100644 index 0000000000..26c480b150 Binary files /dev/null and b/ring-0.17.14/tests/rsa_test_private_key_2048.p8 differ diff --git a/ring-0.17.14/tests/rsa_test_public_key_2048.der b/ring-0.17.14/tests/rsa_test_public_key_2048.der new file mode 100644 index 0000000000..47f18a7f5a Binary files /dev/null and b/ring-0.17.14/tests/rsa_test_public_key_2048.der differ diff --git a/ring-0.17.14/tests/rsa_test_public_key_2048_debug.txt b/ring-0.17.14/tests/rsa_test_public_key_2048_debug.txt new file mode 100644 index 0000000000..659d6ee495 --- /dev/null +++ b/ring-0.17.14/tests/rsa_test_public_key_2048_debug.txt @@ -0,0 +1 @@ +PublicKey("3082010a0282010100c8a78500a5a250db8ed36c85b8dcf83c4be1953114faaac7616e0ea24922fa6b7ab01f85582c815cc3bdeb5ed46762bc536accaa8b72705b00cef316b2ec508fb9697241b9e34238419cccf7339eeb8b062147af4f5932f613d9bc0ae70bf6d56d4432e83e13767587531bfa9dd56531741244be75e8bc9226b9fa44b4b8a101358d7e8bb75d0c724a4f11ece77776263faefe79612eb1d71646e77e8982866be1400eafc3580d3139b41aaa7380187372f22e35bd55b288496165c881ed154d5811245c52d56cc09d4916d4f2a50bcf5ae0a2637f4cfa6bf9daafc113dba8383b6dd7da6dd8db22d8510a8d3115983308909a1a0332517aa55e896e154249b30203010001") \ No newline at end of file diff --git a/ring-0.17.14/tests/rsa_test_public_modulus.bin b/ring-0.17.14/tests/rsa_test_public_modulus.bin new file mode 100644 index 0000000000..1f473ee603 Binary files /dev/null and b/ring-0.17.14/tests/rsa_test_public_modulus.bin differ diff --git a/ring-0.17.14/tests/rsa_tests.rs b/ring-0.17.14/tests/rsa_tests.rs new file mode 100644 index 0000000000..835fc85a0c --- /dev/null +++ b/ring-0.17.14/tests/rsa_tests.rs @@ -0,0 +1,348 
@@ +// Copyright 2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![allow(missing_docs)] +#![cfg(feature = "alloc")] + +use ring::{ + error, + io::der, + rand, rsa, + signature::{self, KeyPair}, +}; +#[allow(deprecated)] +use ring::{test, test_file}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn rsa_from_pkcs8_test() { + test::run( + test_file!("rsa_from_pkcs8_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let input = test_case.consume_bytes("Input"); + let error = test_case.consume_optional_string("Error"); + + match (rsa::KeyPair::from_pkcs8(&input), error) { + (Ok(_), None) => {} + (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), + (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), + (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + }; + + Ok(()) + }, + ); +} + +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_pkcs1_sign() { + let rng = rand::SystemRandom::new(); + test::run( + test_file!("rsa_pkcs1_sign_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &signature::RSA_PKCS1_SHA256, + "SHA384" => &signature::RSA_PKCS1_SHA384, + "SHA512" => &signature::RSA_PKCS1_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let private_key = test_case.consume_bytes("Key"); + let msg = test_case.consume_bytes("Msg"); + let expected = test_case.consume_bytes("Sig"); + let result = test_case.consume_string("Result"); + + let key_pair = rsa::KeyPair::from_der(&private_key); + if result == "Fail-Invalid-Key" { + assert!(key_pair.is_err()); + return Ok(()); + } + let key_pair = key_pair.unwrap(); + + // XXX: This test is too slow on Android ARM Travis CI builds. + // TODO: re-enable these tests on Android ARM. 
+ let mut actual = vec![0u8; key_pair.public().modulus_len()]; + key_pair + .sign(alg, &rng, &msg, actual.as_mut_slice()) + .unwrap(); + assert_eq!(actual.as_slice() == &expected[..], result == "Pass"); + Ok(()) + }, + ); +} + +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_pss_sign() { + test::run( + test_file!("rsa_pss_sign_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &signature::RSA_PSS_SHA256, + "SHA384" => &signature::RSA_PSS_SHA384, + "SHA512" => &signature::RSA_PSS_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let result = test_case.consume_string("Result"); + let private_key = test_case.consume_bytes("Key"); + let key_pair = rsa::KeyPair::from_der(&private_key); + if key_pair.is_err() && result == "Fail-Invalid-Key" { + return Ok(()); + } + let key_pair = key_pair.unwrap(); + let msg = test_case.consume_bytes("Msg"); + let salt = test_case.consume_bytes("Salt"); + let expected = test_case.consume_bytes("Sig"); + + #[allow(deprecated)] + let rng = test::rand::FixedSliceRandom { bytes: &salt }; + + let mut actual = vec![0u8; key_pair.public().modulus_len()]; + key_pair.sign(alg, &rng, &msg, actual.as_mut_slice())?; + assert_eq!(actual.as_slice() == &expected[..], result == "Pass"); + Ok(()) + }, + ); +} + +// `KeyPair::sign` requires that the output buffer is the same length as +// the public key modulus. Test what happens when it isn't the same length. +#[test] +fn test_signature_rsa_pkcs1_sign_output_buffer_len() { + // Sign the message "hello, world", using PKCS#1 v1.5 padding and the + // SHA256 digest algorithm. + const MESSAGE: &[u8] = b"hello, world"; + let rng = rand::SystemRandom::new(); + + const PRIVATE_KEY_DER: &[u8] = + include_bytes!("../src/rsa/signature_rsa_example_private_key.der"); + let key_pair = rsa::KeyPair::from_der(PRIVATE_KEY_DER).unwrap(); + + // When the output buffer is not exactly the right length, `sign()` returns + // an error (and does not panic or invoke UB). if `sign` doesn't check that + // the length is correct at the beginning then there are various possible + // failure points when the output buffer is too small. 
+ for len in 0..key_pair.public().modulus_len() + 1 { + let mut signature = vec![0; len]; + assert_eq!( + len == key_pair.public().modulus_len(), + key_pair + .sign(&signature::RSA_PKCS1_SHA256, &rng, MESSAGE, &mut signature) + .is_ok() + ); + } +} + +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_pkcs1_verify() { + let sha1_params = &[ + ( + &signature::RSA_PKCS1_1024_8192_SHA1_FOR_LEGACY_USE_ONLY, + 1024, + ), + ( + &signature::RSA_PKCS1_2048_8192_SHA1_FOR_LEGACY_USE_ONLY, + 2048, + ), + ]; + let sha256_params = &[ + ( + &signature::RSA_PKCS1_1024_8192_SHA256_FOR_LEGACY_USE_ONLY, + 1024, + ), + (&signature::RSA_PKCS1_2048_8192_SHA256, 2048), + ]; + let sha384_params = &[ + (&signature::RSA_PKCS1_2048_8192_SHA384, 2048), + (&signature::RSA_PKCS1_3072_8192_SHA384, 3072), + ]; + let sha512_params = &[ + ( + &signature::RSA_PKCS1_1024_8192_SHA512_FOR_LEGACY_USE_ONLY, + 1024, + ), + (&signature::RSA_PKCS1_2048_8192_SHA512, 2048), + ]; + test::run( + test_file!("rsa_pkcs1_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let params: &[_] = match digest_name.as_ref() { + "SHA1" => sha1_params, + "SHA256" => sha256_params, + "SHA384" => sha384_params, + "SHA512" => sha512_params, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let public_key = test_case.consume_bytes("Key"); + + // Sanity check that we correctly DER-encoded the originally- + // provided separate (n, e) components. When we add test vectors + // for improperly-encoded signatures, we'll have to revisit this. + let key_bits = untrusted::Input::from(&public_key) + .read_all(error::Unspecified, |input| { + der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { + let n_bytes = + der::positive_integer(input)?.big_endian_without_leading_zero(); + let _e = der::positive_integer(input)?; + + // Because `n_bytes` has the leading zeros stripped and is big-endian, there + // must be less than 8 leading zero bits. + let n_leading_zeros = usize::try_from(n_bytes[0].leading_zeros()).unwrap(); + assert!(n_leading_zeros < 8); + Ok((n_bytes.len() * 8) - n_leading_zeros) + }) + }) + .expect("invalid DER"); + + let msg = test_case.consume_bytes("Msg"); + let sig = test_case.consume_bytes("Sig"); + let is_valid = test_case.consume_string("Result") == "P"; + for &(alg, min_bits) in params { + let width_ok = key_bits >= min_bits; + let actual_result = + signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); + assert_eq!(actual_result.is_ok(), is_valid && width_ok); + } + + Ok(()) + }, + ); +} + +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_pss_verify() { + test::run( + test_file!("rsa_pss_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let digest_name = test_case.consume_string("Digest"); + let alg = match digest_name.as_ref() { + "SHA256" => &signature::RSA_PSS_2048_8192_SHA256, + "SHA384" => &signature::RSA_PSS_2048_8192_SHA384, + "SHA512" => &signature::RSA_PSS_2048_8192_SHA512, + _ => panic!("Unsupported digest: {}", digest_name), + }; + + let public_key = test_case.consume_bytes("Key"); + + // Sanity check that we correctly DER-encoded the originally- + // provided separate (n, e) components. When we add test vectors + // for improperly-encoded signatures, we'll have to revisit this. 
+ assert!(untrusted::Input::from(&public_key) + .read_all(error::Unspecified, |input| der::nested( + input, + der::Tag::Sequence, + error::Unspecified, + |input| { + let _ = der::positive_integer(input)?; + let _ = der::positive_integer(input)?; + Ok(()) + } + )) + .is_ok()); + + let msg = test_case.consume_bytes("Msg"); + let sig = test_case.consume_bytes("Sig"); + let is_valid = test_case.consume_string("Result") == "P"; + + let actual_result = + signature::UnparsedPublicKey::new(alg, &public_key).verify(&msg, &sig); + assert_eq!(actual_result.is_ok(), is_valid); + + Ok(()) + }, + ); +} + +// Test for `primitive::verify()`. Read public key parts from a file +// and use them to verify a signature. +#[cfg(feature = "alloc")] +#[test] +fn test_signature_rsa_primitive_verification() { + test::run( + test_file!("rsa_primitive_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + let n = test_case.consume_bytes("n"); + let e = test_case.consume_bytes("e"); + let msg = test_case.consume_bytes("Msg"); + let sig = test_case.consume_bytes("Sig"); + let expected = test_case.consume_string("Result"); + let public_key = signature::RsaPublicKeyComponents { n: &n, e: &e }; + let result = public_key.verify(&signature::RSA_PKCS1_2048_8192_SHA256, &msg, &sig); + assert_eq!(result.is_ok(), expected == "Pass"); + Ok(()) + }, + ) +} + +#[cfg(feature = "alloc")] +#[test] +fn rsa_test_keypair_coverage() { + const PRIVATE_KEY: &[u8] = include_bytes!("rsa_test_private_key_2048.p8"); + + let key_pair = rsa::KeyPair::from_pkcs8(PRIVATE_KEY).unwrap(); + + // Test that `signature::KeyPair::PublicKey` is `rsa::PublicKey`; if it + // were a separate type then it would need to be tested separately. + let _: &rsa::PublicKey = key_pair.public_key(); + + test_public_key_coverage(key_pair.public()); + // Test clones. + test_public_key_coverage(&key_pair.public().clone()); + + // Test `Debug` + assert_eq!( + format!("RsaKeyPair {{ public: {:?} }}", key_pair.public_key()), + format!("{:?}", key_pair) + ); +} + +fn test_public_key_coverage(key: &rsa::PublicKey) { + // Test `AsRef<[u8]>` + const PUBLIC_KEY: &[u8] = include_bytes!("rsa_test_public_key_2048.der"); + assert_eq!(key.as_ref(), PUBLIC_KEY); + + // Test `Debug`. 
+ const PUBLIC_KEY_DEBUG: &str = include_str!("rsa_test_public_key_2048_debug.txt"); + assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key)); + + let components = rsa::PublicKeyComponents::>::from(key); + const PUBLIC_KEY_MODULUS_BE_BYTES: &[u8] = include_bytes!("rsa_test_public_modulus.bin"); + assert_eq!(PUBLIC_KEY_MODULUS_BE_BYTES, &components.n); + const _65537: &[u8] = &[0x01, 0x00, 0x01]; + assert_eq!(_65537, &components.e); +} diff --git a/ring-0.17.14/tests/signature_tests.rs b/ring-0.17.14/tests/signature_tests.rs new file mode 100644 index 0000000000..dd340fab65 --- /dev/null +++ b/ring-0.17.14/tests/signature_tests.rs @@ -0,0 +1,30 @@ +#![allow(missing_docs)] + +use ring::signature; +#[allow(deprecated)] +use ring::test; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +use wasm_bindgen_test::{wasm_bindgen_test as test, wasm_bindgen_test_configure}; + +#[cfg(all(target_arch = "wasm32", target_os = "unknown"))] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +fn signature_impl_test() { + test::compile_time_assert_clone::(); + test::compile_time_assert_copy::(); + test::compile_time_assert_send::(); + test::compile_time_assert_sync::(); + + let unparsed_public_key = + signature::UnparsedPublicKey::new(&signature::ED25519, &[0x01, 0x02, 0x03]); + + assert_eq!( + format!("{:?}", unparsed_public_key), + r#"UnparsedPublicKey { algorithm: ring::signature::ED25519, bytes: "010203" }"# + ); + + // Test `AsRef<[u8]>` + assert_eq!(unparsed_public_key.as_ref(), &[0x01, 0x02, 0x03]); +} diff --git a/ring-0.17.14/third_party/fiat/LICENSE b/ring-0.17.14/third_party/fiat/LICENSE new file mode 100644 index 0000000000..3bc4b882eb --- /dev/null +++ b/ring-0.17.14/third_party/fiat/LICENSE @@ -0,0 +1,15 @@ +The Apache License, Version 2.0 (Apache-2.0) + +Copyright 2015-2020 the fiat-crypto authors (see the AUTHORS file) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
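Editorial note on the third_party/fiat files that follow: they implement field arithmetic in GF(2^255 - 19) for curve25519, with hand-written ADX assembly for the 64-bit x86 multiply and square, and fiat-crypto-generated C (curve25519_32.h) for the portable 32-bit path. The constants 0x13 (19) and 0x26 (38) scattered through the generated carry_mul come from the identity 2^255 ≡ 19 (mod 2^255 - 19): partial products whose limb weights reach 2^255 are folded back in multiplied by 19, and 38 = 2*19 absorbs the extra factor of two introduced by the mixed 26/25-bit limb widths. Below is a minimal, variable-time sketch of the same folding idea, using a hypothetical toy modulus p = 2^30 - 19 so it fits in u64; this is an editorial illustration, not the fiat-generated code itself.

    // Multiply a*b mod p for the toy prime p = 2^30 - 19, folding the high
    // part of the product back with the factor 19, since 2^30 == 19 (mod p).
    fn mul_mod_p(a: u64, b: u64) -> u64 {
        const K: u32 = 30;
        const C: u64 = 19;                        // p = 2^K - C
        const P: u64 = (1u64 << K) - C;
        let wide = a as u128 * b as u128;         // full product, at most ~60 bits
        let lo = (wide as u64) & ((1u64 << K) - 1);
        let hi = (wide >> K) as u64;              // the part weighted by 2^K
        let mut r = hi * C + lo;                  // fold: 2^K == C (mod p)
        while r >= P {                            // toy, variable-time reduction;
            r -= P;                               // the real code uses carry chains
        }
        r
    }

    fn main() {
        const P: u64 = (1u64 << 30) - 19;
        let (a, b) = (305_419_896 % P, 1_073_741_823 % P);
        assert_eq!(mul_mod_p(a, b), ((a as u128 * b as u128) % P as u128) as u64);
    }

Unlike this sketch, the assembly and the generated C below avoid data-dependent branches and work on multi-limb representations: four 64-bit limbs on the ADX path, and ten limbs in radix 2^25.5 on the 32-bit path.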
diff --git a/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_mul.S b/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_mul.S new file mode 100644 index 0000000000..7e7be03036 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_mul.S @@ -0,0 +1,178 @@ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ + (defined(__APPLE__) || defined(__ELF__)) + +.intel_syntax noprefix +.text +#if defined(__APPLE__) +.private_extern _fiat_curve25519_adx_mul +.global _fiat_curve25519_adx_mul +_fiat_curve25519_adx_mul: +#else +.type fiat_curve25519_adx_mul, @function +.hidden fiat_curve25519_adx_mul +.global fiat_curve25519_adx_mul +fiat_curve25519_adx_mul: +#endif + +.cfi_startproc +_CET_ENDBR +push rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp, -16 +mov rbp, rsp + +mov rax, rdx +mov rdx, [ rsi + 0x18 ] +mulx r11, r10, [ rax + 0x8 ] +mov rdx, [ rax + 0x0 ] +mov [ rsp - 0x58 ], r15 +.cfi_offset r15, -16-0x58 +mulx r8, rcx, [ rsi + 0x18 ] +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x80 ], rbx +.cfi_offset rbx, -16-0x80 +mulx rbx, r9, [ rax + 0x18 ] +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x70 ], r12 +.cfi_offset r12, -16-0x70 +mulx r15, r12, [ rax + 0x8 ] +mov rdx, [ rsi + 0x0 ] +mov [ rsp - 0x68 ], r13 +.cfi_offset r13, -16-0x68 +mov [ rsp - 0x60 ], r14 +.cfi_offset r14, -16-0x60 +mulx r14, r13, [ rax + 0x0 ] +mov rdx, [ rax + 0x10 ] +mov [ rsp - 0x18 ], r15 +mov [ rsp - 0x50 ], rdi +mulx rdi, r15, [ rsi + 0x0 ] +mov rdx, [ rax + 0x18 ] +mov [ rsp - 0x48 ], r13 +mov [ rsp - 0x40 ], r9 +mulx r9, r13, [ rsi + 0x0 ] +test al, al +adox rcx, rdi +mov rdx, [ rsi + 0x10 ] +mov [ rsp - 0x38 ], r13 +mulx r13, rdi, [ rax + 0x8 ] +adox r10, r9 +mov rdx, 0x0 +adox rbx, rdx +adcx rdi, rcx +adcx r8, r10 +mov r9, rdx +adcx r9, rbx +mov rdx, [ rsi + 0x10 ] +mulx r10, rcx, [ rax + 0x0 ] +mov rdx, [ rsi + 0x0 ] +mov [ rsp - 0x30 ], r15 +mulx r15, rbx, [ rax + 0x8 ] +mov rdx, -0x2 +inc rdx +adox rcx, r15 +setc r15b +clc +adcx rcx, r12 +adox r10, rdi +mov rdx, [ rax + 0x10 ] +mov [ rsp - 0x78 ], rcx +mulx rcx, rdi, [ rsi + 0x10 ] +adox rdi, r8 +mov rdx, [ rax + 0x18 ] +mov [ rsp - 0x28 ], rcx +mulx rcx, r8, [ rsi + 0x10 ] +mov rdx, [ rax + 0x10 ] +mov [ rsp - 0x20 ], r8 +mulx r12, r8, [ rsi + 0x18 ] +adox r8, r9 +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x10 ], r12 +mulx r12, r9, [ rax + 0x10 ] +movzx rdx, r15b +lea rdx, [ rdx + rcx ] +adcx r9, r10 +adcx r13, rdi +mov r15, 0x0 +mov r10, r15 +adox r10, rdx +mov rdx, [ rax + 0x18 ] +mulx rcx, rdi, [ rsi + 0x18 ] +adox rcx, r15 +adcx r11, r8 +mov rdx, r15 +adcx rdx, r10 +adcx rcx, r15 +mov r8, rdx +mov rdx, [ rax + 0x0 ] +mulx r15, r10, [ rsi + 0x8 ] +test al, al +adox r10, r14 +adcx rbx, r10 +adox r15, [ rsp - 0x78 ] +adcx r15, [ rsp - 0x30 ] +adox r9, [ rsp - 0x18 ] +adcx r9, [ rsp - 0x38 ] +adox r13, [ rsp - 0x40 ] +adcx r12, r13 +adox r11, [ rsp - 0x20 ] +adcx r11, [ rsp - 0x28 ] +mov rdx, 0x26 +mulx rsi, r14, r12 +adox rdi, r8 +adcx rdi, [ rsp - 0x10 ] +mulx r10, r8, r11 +mov r13, 0x0 +adox rcx, r13 +adcx rcx, r13 +mulx r11, r12, rdi +xor rdi, rdi +adox r8, rbx +adox r12, r15 +mulx rbx, r13, rcx +adcx r14, [ rsp - 0x48 ] +adox r13, r9 +adox rbx, rdi +adcx rsi, r8 +adcx r10, r12 +adcx r11, r13 +adc rbx, 0x0 +mulx r9, r15, rbx +xor r9, r9 +adox r15, r14 +mov rdi, r9 +adox rdi, rsi +mov rcx, r9 +adox rcx, r10 +mov r8, [ rsp - 0x50 ] +mov [ r8 + 0x8 ], rdi +mov r12, r9 +adox r12, r11 +mov r14, r9 +cmovo r14, rdx +mov [ r8 + 0x18 ], r12 +adcx r15, r14 +mov [ r8 + 0x0 ], r15 +mov [ r8 + 0x10 ], rcx +mov rbx, [ rsp - 0x80 ] +.cfi_restore rbx 
+mov r12, [ rsp - 0x70 ] +.cfi_restore r12 +mov r13, [ rsp - 0x68 ] +.cfi_restore r13 +mov r14, [ rsp - 0x60 ] +.cfi_restore r14 +mov r15, [ rsp - 0x58 ] +.cfi_restore r15 + +pop rbp +.cfi_restore rbp +.cfi_adjust_cfa_offset -8 +ret +.cfi_endproc +#if defined(__ELF__) +.size fiat_curve25519_adx_mul, .-fiat_curve25519_adx_mul +#endif + +#endif diff --git a/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_square.S b/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_square.S new file mode 100644 index 0000000000..ccf7b76e58 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/asm/fiat_curve25519_adx_square.S @@ -0,0 +1,146 @@ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ + (defined(__APPLE__) || defined(__ELF__)) + +.intel_syntax noprefix +.text +#if defined(__APPLE__) +.private_extern _fiat_curve25519_adx_square +.global _fiat_curve25519_adx_square +_fiat_curve25519_adx_square: +#else +.type fiat_curve25519_adx_square, @function +.hidden fiat_curve25519_adx_square +.global fiat_curve25519_adx_square +fiat_curve25519_adx_square: +#endif + +.cfi_startproc +_CET_ENDBR +push rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp, -16 +mov rbp, rsp + +mov rdx, [ rsi + 0x0 ] +mulx r10, rax, [ rsi + 0x8 ] +mov rdx, [ rsi + 0x0 ] +mulx rcx, r11, [ rsi + 0x10 ] +xor rdx, rdx +adox r11, r10 +mov rdx, [ rsi + 0x0 ] +mulx r9, r8, [ rsi + 0x18 ] +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x80 ], rbx +.cfi_offset rbx, -16-0x80 +mulx rbx, r10, [ rsi + 0x18 ] +adox r8, rcx +mov [rsp - 0x48 ], rdi +adox r10, r9 +adcx rax, rax +mov rdx, [ rsi + 0x10 ] +mulx r9, rcx, [ rsi + 0x18 ] +adox rcx, rbx +mov rdx, [ rsi + 0x10 ] +mulx rdi, rbx, [ rsi + 0x8 ] +mov rdx, 0x0 +adox r9, rdx +mov [ rsp - 0x70 ], r12 +.cfi_offset r12, -16-0x70 +mov r12, -0x3 +inc r12 +adox rbx, r8 +adox rdi, r10 +adcx r11, r11 +mov r8, rdx +adox r8, rcx +mov r10, rdx +adox r10, r9 +adcx rbx, rbx +mov rdx, [ rsi + 0x0 ] +mulx r9, rcx, rdx +mov rdx, [ rsi + 0x8 ] +mov [ rsp - 0x68 ], r13 +.cfi_offset r13, -16-0x68 +mov [ rsp - 0x60 ], r14 +.cfi_offset r14, -16-0x60 +mulx r14, r13, rdx +seto dl +inc r12 +adox r9, rax +adox r13, r11 +adox r14, rbx +adcx rdi, rdi +mov al, dl +mov rdx, [ rsi + 0x10 ] +mulx rbx, r11, rdx +adox r11, rdi +adcx r8, r8 +adox rbx, r8 +adcx r10, r10 +movzx rdx, al +mov rdi, 0x0 +adcx rdx, rdi +movzx r8, al +lea r8, [ r8 + rdx ] +mov rdx, [ rsi + 0x18 ] +mulx rdi, rax, rdx +adox rax, r10 +mov rdx, 0x26 +mov [ rsp - 0x58 ], r15 +.cfi_offset r15, -16-0x58 +mulx r15, r10, r11 +clc +adcx r10, rcx +mulx r11, rcx, rbx +adox r8, rdi +mulx rdi, rbx, r8 +inc r12 +adox rcx, r9 +mulx r8, r9, rax +adcx r15, rcx +adox r9, r13 +adcx r11, r9 +adox rbx, r14 +adox rdi, r12 +adcx r8, rbx +adc rdi, 0x0 +mulx r14, r13, rdi +test al, al +mov rdi, [ rsp - 0x48 ] +adox r13, r10 +mov r14, r12 +adox r14, r15 +mov [ rdi + 0x8 ], r14 +mov rax, r12 +adox rax, r11 +mov r10, r12 +adox r10, r8 +mov [ rdi + 0x10 ], rax +mov rcx, r12 +cmovo rcx, rdx +adcx r13, rcx +mov [ rdi + 0x0 ], r13 +mov [ rdi + 0x18 ], r10 +mov rbx, [ rsp - 0x80 ] +.cfi_restore rbx +mov r12, [ rsp - 0x70 ] +.cfi_restore r12 +mov r13, [ rsp - 0x68 ] +.cfi_restore r13 +mov r14, [ rsp - 0x60 ] +.cfi_restore r14 +mov r15, [ rsp - 0x58 ] +.cfi_restore r15 + +pop rbp +.cfi_restore rbp +.cfi_adjust_cfa_offset -8 +ret +.cfi_endproc +#if defined(__ELF__) +.size fiat_curve25519_adx_square, .-fiat_curve25519_adx_square +#endif + +#endif diff --git a/ring-0.17.14/third_party/fiat/curve25519_32.h b/ring-0.17.14/third_party/fiat/curve25519_32.h new file mode 100644 index 
0000000000..56417ce635 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/curve25519_32.h @@ -0,0 +1,1479 @@ +/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier 25519 32 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ +/* curve description: 25519 */ +/* machine_wordsize = 32 (from "32") */ +/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ +/* n = 10 (from "(auto)") */ +/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ +/* tight_bounds_multiplier = 1 (from "") */ +/* */ +/* Computed values: */ +/* carry_chain = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1] */ +/* eval z = z[0] + (z[1] << 26) + (z[2] << 51) + (z[3] << 77) + (z[4] << 102) + (z[5] << 128) + (z[6] << 153) + (z[7] << 179) + (z[8] << 204) + (z[9] << 230) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* balance = [0x7ffffda, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe] */ + +#include +typedef unsigned char fiat_25519_uint1; +typedef signed char fiat_25519_int1; +#if defined(__GNUC__) || defined(__clang__) +# define FIAT_25519_FIAT_INLINE __inline__ +#else +# define FIAT_25519_FIAT_INLINE +#endif + +/* The type fiat_25519_loose_field_element is a field element with loose bounds. */ +/* Bounds: [[0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000]] */ +typedef uint32_t fiat_25519_loose_field_element[10]; + +/* The type fiat_25519_tight_field_element is a field element with tight bounds. */ +/* Bounds: [[0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000]] */ +typedef uint32_t fiat_25519_tight_field_element[10]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#if !defined(FIAT_25519_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +static __inline__ uint32_t fiat_25519_value_barrier_u32(uint32_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} +#else +# define fiat_25519_value_barrier_u32(x) (x) +#endif + + +/* + * The function fiat_25519_addcarryx_u26 is an addition with carry. 
+ * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^26 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^26⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x3ffffff] + * arg3: [0x0 ~> 0x3ffffff] + * Output Bounds: + * out1: [0x0 ~> 0x3ffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + uint32_t x1; + uint32_t x2; + fiat_25519_uint1 x3; + x1 = ((arg1 + arg2) + arg3); + x2 = (x1 & UINT32_C(0x3ffffff)); + x3 = (fiat_25519_uint1)(x1 >> 26); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_25519_subborrowx_u26 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^26 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^26⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x3ffffff] + * arg3: [0x0 ~> 0x3ffffff] + * Output Bounds: + * out1: [0x0 ~> 0x3ffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + int32_t x1; + fiat_25519_int1 x2; + uint32_t x3; + x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3); + x2 = (fiat_25519_int1)(x1 >> 26); + x3 = (x1 & UINT32_C(0x3ffffff)); + *out1 = x3; + *out2 = (fiat_25519_uint1)(0x0 - x2); +} + +/* + * The function fiat_25519_addcarryx_u25 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^25 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^25⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x1ffffff] + * arg3: [0x0 ~> 0x1ffffff] + * Output Bounds: + * out1: [0x0 ~> 0x1ffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + uint32_t x1; + uint32_t x2; + fiat_25519_uint1 x3; + x1 = ((arg1 + arg2) + arg3); + x2 = (x1 & UINT32_C(0x1ffffff)); + x3 = (fiat_25519_uint1)(x1 >> 25); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_25519_subborrowx_u25 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^25 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^25⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x1ffffff] + * arg3: [0x0 ~> 0x1ffffff] + * Output Bounds: + * out1: [0x0 ~> 0x1ffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + int32_t x1; + fiat_25519_int1 x2; + uint32_t x3; + x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3); + x2 = (fiat_25519_int1)(x1 >> 25); + x3 = (x1 & UINT32_C(0x1ffffff)); + *out1 = x3; + *out2 = (fiat_25519_uint1)(0x0 - x2); +} + +/* + * The function fiat_25519_cmovznz_u32 is a single-word conditional move. 
+ * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffff] + * arg3: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u32(uint32_t* out1, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) { + fiat_25519_uint1 x1; + uint32_t x2; + uint32_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_25519_int1)(0x0 - x1) & UINT32_C(0xffffffff)); + x3 = ((fiat_25519_value_barrier_u32(x2) & arg3) | (fiat_25519_value_barrier_u32((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. + * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint64_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + uint64_t x71; + uint64_t x72; + uint64_t x73; + uint64_t x74; + uint64_t x75; + uint64_t x76; + uint64_t x77; + uint64_t x78; + uint64_t x79; + uint64_t x80; + uint64_t x81; + uint64_t x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + uint64_t x93; + uint64_t x94; + uint64_t x95; + uint64_t x96; + uint64_t x97; + uint64_t x98; + uint64_t x99; + uint64_t x100; + uint64_t x101; + uint64_t x102; + uint32_t x103; + uint64_t x104; + uint64_t x105; + uint64_t x106; + uint64_t x107; + uint64_t x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint32_t x115; + uint64_t x116; + uint64_t x117; + uint32_t x118; + uint64_t x119; + uint64_t x120; + uint32_t x121; + uint64_t x122; + uint64_t x123; + uint32_t x124; + uint64_t x125; + uint64_t x126; + uint32_t x127; + uint64_t x128; + uint64_t x129; + uint32_t x130; + uint64_t x131; + uint64_t x132; + uint32_t x133; + uint64_t x134; + uint64_t x135; + uint32_t x136; + uint64_t x137; + uint64_t x138; + uint32_t x139; + uint64_t x140; + uint64_t x141; + uint32_t x142; + uint32_t x143; + uint32_t x144; + fiat_25519_uint1 x145; + uint32_t x146; + uint32_t x147; + x1 = ((uint64_t)(arg1[9]) * ((arg2[9]) * UINT8_C(0x26))); + x2 = ((uint64_t)(arg1[9]) * ((arg2[8]) * UINT8_C(0x13))); + x3 = ((uint64_t)(arg1[9]) * ((arg2[7]) * UINT8_C(0x26))); + 
x4 = ((uint64_t)(arg1[9]) * ((arg2[6]) * UINT8_C(0x13))); + x5 = ((uint64_t)(arg1[9]) * ((arg2[5]) * UINT8_C(0x26))); + x6 = ((uint64_t)(arg1[9]) * ((arg2[4]) * UINT8_C(0x13))); + x7 = ((uint64_t)(arg1[9]) * ((arg2[3]) * UINT8_C(0x26))); + x8 = ((uint64_t)(arg1[9]) * ((arg2[2]) * UINT8_C(0x13))); + x9 = ((uint64_t)(arg1[9]) * ((arg2[1]) * UINT8_C(0x26))); + x10 = ((uint64_t)(arg1[8]) * ((arg2[9]) * UINT8_C(0x13))); + x11 = ((uint64_t)(arg1[8]) * ((arg2[8]) * UINT8_C(0x13))); + x12 = ((uint64_t)(arg1[8]) * ((arg2[7]) * UINT8_C(0x13))); + x13 = ((uint64_t)(arg1[8]) * ((arg2[6]) * UINT8_C(0x13))); + x14 = ((uint64_t)(arg1[8]) * ((arg2[5]) * UINT8_C(0x13))); + x15 = ((uint64_t)(arg1[8]) * ((arg2[4]) * UINT8_C(0x13))); + x16 = ((uint64_t)(arg1[8]) * ((arg2[3]) * UINT8_C(0x13))); + x17 = ((uint64_t)(arg1[8]) * ((arg2[2]) * UINT8_C(0x13))); + x18 = ((uint64_t)(arg1[7]) * ((arg2[9]) * UINT8_C(0x26))); + x19 = ((uint64_t)(arg1[7]) * ((arg2[8]) * UINT8_C(0x13))); + x20 = ((uint64_t)(arg1[7]) * ((arg2[7]) * UINT8_C(0x26))); + x21 = ((uint64_t)(arg1[7]) * ((arg2[6]) * UINT8_C(0x13))); + x22 = ((uint64_t)(arg1[7]) * ((arg2[5]) * UINT8_C(0x26))); + x23 = ((uint64_t)(arg1[7]) * ((arg2[4]) * UINT8_C(0x13))); + x24 = ((uint64_t)(arg1[7]) * ((arg2[3]) * UINT8_C(0x26))); + x25 = ((uint64_t)(arg1[6]) * ((arg2[9]) * UINT8_C(0x13))); + x26 = ((uint64_t)(arg1[6]) * ((arg2[8]) * UINT8_C(0x13))); + x27 = ((uint64_t)(arg1[6]) * ((arg2[7]) * UINT8_C(0x13))); + x28 = ((uint64_t)(arg1[6]) * ((arg2[6]) * UINT8_C(0x13))); + x29 = ((uint64_t)(arg1[6]) * ((arg2[5]) * UINT8_C(0x13))); + x30 = ((uint64_t)(arg1[6]) * ((arg2[4]) * UINT8_C(0x13))); + x31 = ((uint64_t)(arg1[5]) * ((arg2[9]) * UINT8_C(0x26))); + x32 = ((uint64_t)(arg1[5]) * ((arg2[8]) * UINT8_C(0x13))); + x33 = ((uint64_t)(arg1[5]) * ((arg2[7]) * UINT8_C(0x26))); + x34 = ((uint64_t)(arg1[5]) * ((arg2[6]) * UINT8_C(0x13))); + x35 = ((uint64_t)(arg1[5]) * ((arg2[5]) * UINT8_C(0x26))); + x36 = ((uint64_t)(arg1[4]) * ((arg2[9]) * UINT8_C(0x13))); + x37 = ((uint64_t)(arg1[4]) * ((arg2[8]) * UINT8_C(0x13))); + x38 = ((uint64_t)(arg1[4]) * ((arg2[7]) * UINT8_C(0x13))); + x39 = ((uint64_t)(arg1[4]) * ((arg2[6]) * UINT8_C(0x13))); + x40 = ((uint64_t)(arg1[3]) * ((arg2[9]) * UINT8_C(0x26))); + x41 = ((uint64_t)(arg1[3]) * ((arg2[8]) * UINT8_C(0x13))); + x42 = ((uint64_t)(arg1[3]) * ((arg2[7]) * UINT8_C(0x26))); + x43 = ((uint64_t)(arg1[2]) * ((arg2[9]) * UINT8_C(0x13))); + x44 = ((uint64_t)(arg1[2]) * ((arg2[8]) * UINT8_C(0x13))); + x45 = ((uint64_t)(arg1[1]) * ((arg2[9]) * UINT8_C(0x26))); + x46 = ((uint64_t)(arg1[9]) * (arg2[0])); + x47 = ((uint64_t)(arg1[8]) * (arg2[1])); + x48 = ((uint64_t)(arg1[8]) * (arg2[0])); + x49 = ((uint64_t)(arg1[7]) * (arg2[2])); + x50 = ((uint64_t)(arg1[7]) * ((arg2[1]) * 0x2)); + x51 = ((uint64_t)(arg1[7]) * (arg2[0])); + x52 = ((uint64_t)(arg1[6]) * (arg2[3])); + x53 = ((uint64_t)(arg1[6]) * (arg2[2])); + x54 = ((uint64_t)(arg1[6]) * (arg2[1])); + x55 = ((uint64_t)(arg1[6]) * (arg2[0])); + x56 = ((uint64_t)(arg1[5]) * (arg2[4])); + x57 = ((uint64_t)(arg1[5]) * ((arg2[3]) * 0x2)); + x58 = ((uint64_t)(arg1[5]) * (arg2[2])); + x59 = ((uint64_t)(arg1[5]) * ((arg2[1]) * 0x2)); + x60 = ((uint64_t)(arg1[5]) * (arg2[0])); + x61 = ((uint64_t)(arg1[4]) * (arg2[5])); + x62 = ((uint64_t)(arg1[4]) * (arg2[4])); + x63 = ((uint64_t)(arg1[4]) * (arg2[3])); + x64 = ((uint64_t)(arg1[4]) * (arg2[2])); + x65 = ((uint64_t)(arg1[4]) * (arg2[1])); + x66 = ((uint64_t)(arg1[4]) * (arg2[0])); + x67 = ((uint64_t)(arg1[3]) * (arg2[6])); + x68 = ((uint64_t)(arg1[3]) * 
((arg2[5]) * 0x2)); + x69 = ((uint64_t)(arg1[3]) * (arg2[4])); + x70 = ((uint64_t)(arg1[3]) * ((arg2[3]) * 0x2)); + x71 = ((uint64_t)(arg1[3]) * (arg2[2])); + x72 = ((uint64_t)(arg1[3]) * ((arg2[1]) * 0x2)); + x73 = ((uint64_t)(arg1[3]) * (arg2[0])); + x74 = ((uint64_t)(arg1[2]) * (arg2[7])); + x75 = ((uint64_t)(arg1[2]) * (arg2[6])); + x76 = ((uint64_t)(arg1[2]) * (arg2[5])); + x77 = ((uint64_t)(arg1[2]) * (arg2[4])); + x78 = ((uint64_t)(arg1[2]) * (arg2[3])); + x79 = ((uint64_t)(arg1[2]) * (arg2[2])); + x80 = ((uint64_t)(arg1[2]) * (arg2[1])); + x81 = ((uint64_t)(arg1[2]) * (arg2[0])); + x82 = ((uint64_t)(arg1[1]) * (arg2[8])); + x83 = ((uint64_t)(arg1[1]) * ((arg2[7]) * 0x2)); + x84 = ((uint64_t)(arg1[1]) * (arg2[6])); + x85 = ((uint64_t)(arg1[1]) * ((arg2[5]) * 0x2)); + x86 = ((uint64_t)(arg1[1]) * (arg2[4])); + x87 = ((uint64_t)(arg1[1]) * ((arg2[3]) * 0x2)); + x88 = ((uint64_t)(arg1[1]) * (arg2[2])); + x89 = ((uint64_t)(arg1[1]) * ((arg2[1]) * 0x2)); + x90 = ((uint64_t)(arg1[1]) * (arg2[0])); + x91 = ((uint64_t)(arg1[0]) * (arg2[9])); + x92 = ((uint64_t)(arg1[0]) * (arg2[8])); + x93 = ((uint64_t)(arg1[0]) * (arg2[7])); + x94 = ((uint64_t)(arg1[0]) * (arg2[6])); + x95 = ((uint64_t)(arg1[0]) * (arg2[5])); + x96 = ((uint64_t)(arg1[0]) * (arg2[4])); + x97 = ((uint64_t)(arg1[0]) * (arg2[3])); + x98 = ((uint64_t)(arg1[0]) * (arg2[2])); + x99 = ((uint64_t)(arg1[0]) * (arg2[1])); + x100 = ((uint64_t)(arg1[0]) * (arg2[0])); + x101 = (x100 + (x45 + (x44 + (x42 + (x39 + (x35 + (x30 + (x24 + (x17 + x9))))))))); + x102 = (x101 >> 26); + x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff)); + x104 = (x91 + (x82 + (x74 + (x67 + (x61 + (x56 + (x52 + (x49 + (x47 + x46))))))))); + x105 = (x92 + (x83 + (x75 + (x68 + (x62 + (x57 + (x53 + (x50 + (x48 + x1))))))))); + x106 = (x93 + (x84 + (x76 + (x69 + (x63 + (x58 + (x54 + (x51 + (x10 + x2))))))))); + x107 = (x94 + (x85 + (x77 + (x70 + (x64 + (x59 + (x55 + (x18 + (x11 + x3))))))))); + x108 = (x95 + (x86 + (x78 + (x71 + (x65 + (x60 + (x25 + (x19 + (x12 + x4))))))))); + x109 = (x96 + (x87 + (x79 + (x72 + (x66 + (x31 + (x26 + (x20 + (x13 + x5))))))))); + x110 = (x97 + (x88 + (x80 + (x73 + (x36 + (x32 + (x27 + (x21 + (x14 + x6))))))))); + x111 = (x98 + (x89 + (x81 + (x40 + (x37 + (x33 + (x28 + (x22 + (x15 + x7))))))))); + x112 = (x99 + (x90 + (x43 + (x41 + (x38 + (x34 + (x29 + (x23 + (x16 + x8))))))))); + x113 = (x102 + x112); + x114 = (x113 >> 25); + x115 = (uint32_t)(x113 & UINT32_C(0x1ffffff)); + x116 = (x114 + x111); + x117 = (x116 >> 26); + x118 = (uint32_t)(x116 & UINT32_C(0x3ffffff)); + x119 = (x117 + x110); + x120 = (x119 >> 25); + x121 = (uint32_t)(x119 & UINT32_C(0x1ffffff)); + x122 = (x120 + x109); + x123 = (x122 >> 26); + x124 = (uint32_t)(x122 & UINT32_C(0x3ffffff)); + x125 = (x123 + x108); + x126 = (x125 >> 25); + x127 = (uint32_t)(x125 & UINT32_C(0x1ffffff)); + x128 = (x126 + x107); + x129 = (x128 >> 26); + x130 = (uint32_t)(x128 & UINT32_C(0x3ffffff)); + x131 = (x129 + x106); + x132 = (x131 >> 25); + x133 = (uint32_t)(x131 & UINT32_C(0x1ffffff)); + x134 = (x132 + x105); + x135 = (x134 >> 26); + x136 = (uint32_t)(x134 & UINT32_C(0x3ffffff)); + x137 = (x135 + x104); + x138 = (x137 >> 25); + x139 = (uint32_t)(x137 & UINT32_C(0x1ffffff)); + x140 = (x138 * UINT8_C(0x13)); + x141 = (x103 + x140); + x142 = (uint32_t)(x141 >> 26); + x143 = (uint32_t)(x141 & UINT32_C(0x3ffffff)); + x144 = (x142 + x115); + x145 = (fiat_25519_uint1)(x144 >> 25); + x146 = (x144 & UINT32_C(0x1ffffff)); + x147 = (x145 + x118); + out1[0] = x143; + out1[1] = x146; + out1[2] = x147; 
+ out1[3] = x121; + out1[4] = x124; + out1[5] = x127; + out1[6] = x130; + out1[7] = x133; + out1[8] = x136; + out1[9] = x139; +} + +/* + * The function fiat_25519_carry_square squares a field element and reduces the result. + * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint64_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint64_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint32_t x16; + uint32_t x17; + uint32_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint64_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + uint64_t x71; + uint64_t x72; + uint64_t x73; + uint64_t x74; + uint64_t x75; + uint32_t x76; + uint64_t x77; + uint64_t x78; + uint64_t x79; + uint64_t x80; + uint64_t x81; + uint64_t x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint32_t x88; + uint64_t x89; + uint64_t x90; + uint32_t x91; + uint64_t x92; + uint64_t x93; + uint32_t x94; + uint64_t x95; + uint64_t x96; + uint32_t x97; + uint64_t x98; + uint64_t x99; + uint32_t x100; + uint64_t x101; + uint64_t x102; + uint32_t x103; + uint64_t x104; + uint64_t x105; + uint32_t x106; + uint64_t x107; + uint64_t x108; + uint32_t x109; + uint64_t x110; + uint64_t x111; + uint32_t x112; + uint64_t x113; + uint64_t x114; + uint32_t x115; + uint32_t x116; + uint32_t x117; + fiat_25519_uint1 x118; + uint32_t x119; + uint32_t x120; + x1 = ((arg1[9]) * UINT8_C(0x13)); + x2 = (x1 * 0x2); + x3 = ((arg1[9]) * 0x2); + x4 = ((arg1[8]) * UINT8_C(0x13)); + x5 = ((uint64_t)x4 * 0x2); + x6 = ((arg1[8]) * 0x2); + x7 = ((arg1[7]) * UINT8_C(0x13)); + x8 = (x7 * 0x2); + x9 = ((arg1[7]) * 0x2); + x10 = ((arg1[6]) * UINT8_C(0x13)); + x11 = ((uint64_t)x10 * 0x2); + x12 = ((arg1[6]) * 0x2); + x13 = ((arg1[5]) * UINT8_C(0x13)); + x14 = ((arg1[5]) * 0x2); + x15 = ((arg1[4]) * 0x2); + x16 = ((arg1[3]) * 0x2); + x17 = ((arg1[2]) * 0x2); + x18 = ((arg1[1]) * 0x2); + x19 = ((uint64_t)(arg1[9]) * (x1 * 0x2)); + x20 = ((uint64_t)(arg1[8]) * x2); + x21 = ((uint64_t)(arg1[8]) * x4); + x22 = ((arg1[7]) * ((uint64_t)x2 * 0x2)); + x23 = ((arg1[7]) * x5); + x24 = ((uint64_t)(arg1[7]) * (x7 * 0x2)); + x25 = ((uint64_t)(arg1[6]) * x2); + x26 = ((arg1[6]) * x5); + x27 = ((uint64_t)(arg1[6]) * x8); + x28 = ((uint64_t)(arg1[6]) * x10); + x29 = ((arg1[5]) * ((uint64_t)x2 * 0x2)); + x30 = ((arg1[5]) * x5); + x31 = ((arg1[5]) * ((uint64_t)x8 * 0x2)); + x32 = ((arg1[5]) * x11); + x33 = ((uint64_t)(arg1[5]) * (x13 * 0x2)); + x34 = ((uint64_t)(arg1[4]) * x2); + x35 = ((arg1[4]) * x5); + x36 = ((uint64_t)(arg1[4]) * 
x8); + x37 = ((arg1[4]) * x11); + x38 = ((uint64_t)(arg1[4]) * x14); + x39 = ((uint64_t)(arg1[4]) * (arg1[4])); + x40 = ((arg1[3]) * ((uint64_t)x2 * 0x2)); + x41 = ((arg1[3]) * x5); + x42 = ((arg1[3]) * ((uint64_t)x8 * 0x2)); + x43 = ((uint64_t)(arg1[3]) * x12); + x44 = ((uint64_t)(arg1[3]) * (x14 * 0x2)); + x45 = ((uint64_t)(arg1[3]) * x15); + x46 = ((uint64_t)(arg1[3]) * ((arg1[3]) * 0x2)); + x47 = ((uint64_t)(arg1[2]) * x2); + x48 = ((arg1[2]) * x5); + x49 = ((uint64_t)(arg1[2]) * x9); + x50 = ((uint64_t)(arg1[2]) * x12); + x51 = ((uint64_t)(arg1[2]) * x14); + x52 = ((uint64_t)(arg1[2]) * x15); + x53 = ((uint64_t)(arg1[2]) * x16); + x54 = ((uint64_t)(arg1[2]) * (arg1[2])); + x55 = ((arg1[1]) * ((uint64_t)x2 * 0x2)); + x56 = ((uint64_t)(arg1[1]) * x6); + x57 = ((uint64_t)(arg1[1]) * (x9 * 0x2)); + x58 = ((uint64_t)(arg1[1]) * x12); + x59 = ((uint64_t)(arg1[1]) * (x14 * 0x2)); + x60 = ((uint64_t)(arg1[1]) * x15); + x61 = ((uint64_t)(arg1[1]) * (x16 * 0x2)); + x62 = ((uint64_t)(arg1[1]) * x17); + x63 = ((uint64_t)(arg1[1]) * ((arg1[1]) * 0x2)); + x64 = ((uint64_t)(arg1[0]) * x3); + x65 = ((uint64_t)(arg1[0]) * x6); + x66 = ((uint64_t)(arg1[0]) * x9); + x67 = ((uint64_t)(arg1[0]) * x12); + x68 = ((uint64_t)(arg1[0]) * x14); + x69 = ((uint64_t)(arg1[0]) * x15); + x70 = ((uint64_t)(arg1[0]) * x16); + x71 = ((uint64_t)(arg1[0]) * x17); + x72 = ((uint64_t)(arg1[0]) * x18); + x73 = ((uint64_t)(arg1[0]) * (arg1[0])); + x74 = (x73 + (x55 + (x48 + (x42 + (x37 + x33))))); + x75 = (x74 >> 26); + x76 = (uint32_t)(x74 & UINT32_C(0x3ffffff)); + x77 = (x64 + (x56 + (x49 + (x43 + x38)))); + x78 = (x65 + (x57 + (x50 + (x44 + (x39 + x19))))); + x79 = (x66 + (x58 + (x51 + (x45 + x20)))); + x80 = (x67 + (x59 + (x52 + (x46 + (x22 + x21))))); + x81 = (x68 + (x60 + (x53 + (x25 + x23)))); + x82 = (x69 + (x61 + (x54 + (x29 + (x26 + x24))))); + x83 = (x70 + (x62 + (x34 + (x30 + x27)))); + x84 = (x71 + (x63 + (x40 + (x35 + (x31 + x28))))); + x85 = (x72 + (x47 + (x41 + (x36 + x32)))); + x86 = (x75 + x85); + x87 = (x86 >> 25); + x88 = (uint32_t)(x86 & UINT32_C(0x1ffffff)); + x89 = (x87 + x84); + x90 = (x89 >> 26); + x91 = (uint32_t)(x89 & UINT32_C(0x3ffffff)); + x92 = (x90 + x83); + x93 = (x92 >> 25); + x94 = (uint32_t)(x92 & UINT32_C(0x1ffffff)); + x95 = (x93 + x82); + x96 = (x95 >> 26); + x97 = (uint32_t)(x95 & UINT32_C(0x3ffffff)); + x98 = (x96 + x81); + x99 = (x98 >> 25); + x100 = (uint32_t)(x98 & UINT32_C(0x1ffffff)); + x101 = (x99 + x80); + x102 = (x101 >> 26); + x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff)); + x104 = (x102 + x79); + x105 = (x104 >> 25); + x106 = (uint32_t)(x104 & UINT32_C(0x1ffffff)); + x107 = (x105 + x78); + x108 = (x107 >> 26); + x109 = (uint32_t)(x107 & UINT32_C(0x3ffffff)); + x110 = (x108 + x77); + x111 = (x110 >> 25); + x112 = (uint32_t)(x110 & UINT32_C(0x1ffffff)); + x113 = (x111 * UINT8_C(0x13)); + x114 = (x76 + x113); + x115 = (uint32_t)(x114 >> 26); + x116 = (uint32_t)(x114 & UINT32_C(0x3ffffff)); + x117 = (x115 + x88); + x118 = (fiat_25519_uint1)(x117 >> 25); + x119 = (x117 & UINT32_C(0x1ffffff)); + x120 = (x118 + x91); + out1[0] = x116; + out1[1] = x119; + out1[2] = x120; + out1[3] = x94; + out1[4] = x97; + out1[5] = x100; + out1[6] = x103; + out1[7] = x106; + out1[8] = x109; + out1[9] = x112; +} + +/* + * The function fiat_25519_carry reduces a field element. 
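(For context on fiat_25519_carry_mul and fiat_25519_carry_square above, and not part of the diff itself: multiplication consumes two loose field elements and produces a tight, carried result, so small constants can be fed in directly as limb arrays. A hedged, illustrative driver, assuming the header can be included as "curve25519_32.h":)

/* Illustrative only: computes 3 * 5 in GF(2^255 - 19) with the 32-bit backend. */
#include <stdint.h>
#include <stdio.h>
#include "curve25519_32.h"

int main(void) {
  fiat_25519_loose_field_element a = {3, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  fiat_25519_loose_field_element b = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  fiat_25519_tight_field_element r;
  fiat_25519_carry_mul(r, a, b);
  printf("r[0] = %u\n", (unsigned)r[0]); /* prints 15; remaining limbs are 0 */
  return 0;
}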
+ * + * Postconditions: + * eval out1 mod m = eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint32_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint32_t x16; + uint32_t x17; + uint32_t x18; + uint32_t x19; + uint32_t x20; + uint32_t x21; + uint32_t x22; + x1 = (arg1[0]); + x2 = ((x1 >> 26) + (arg1[1])); + x3 = ((x2 >> 25) + (arg1[2])); + x4 = ((x3 >> 26) + (arg1[3])); + x5 = ((x4 >> 25) + (arg1[4])); + x6 = ((x5 >> 26) + (arg1[5])); + x7 = ((x6 >> 25) + (arg1[6])); + x8 = ((x7 >> 26) + (arg1[7])); + x9 = ((x8 >> 25) + (arg1[8])); + x10 = ((x9 >> 26) + (arg1[9])); + x11 = ((x1 & UINT32_C(0x3ffffff)) + ((x10 >> 25) * UINT8_C(0x13))); + x12 = ((fiat_25519_uint1)(x11 >> 26) + (x2 & UINT32_C(0x1ffffff))); + x13 = (x11 & UINT32_C(0x3ffffff)); + x14 = (x12 & UINT32_C(0x1ffffff)); + x15 = ((fiat_25519_uint1)(x12 >> 25) + (x3 & UINT32_C(0x3ffffff))); + x16 = (x4 & UINT32_C(0x1ffffff)); + x17 = (x5 & UINT32_C(0x3ffffff)); + x18 = (x6 & UINT32_C(0x1ffffff)); + x19 = (x7 & UINT32_C(0x3ffffff)); + x20 = (x8 & UINT32_C(0x1ffffff)); + x21 = (x9 & UINT32_C(0x3ffffff)); + x22 = (x10 & UINT32_C(0x1ffffff)); + out1[0] = x13; + out1[1] = x14; + out1[2] = x15; + out1[3] = x16; + out1[4] = x17; + out1[5] = x18; + out1[6] = x19; + out1[7] = x20; + out1[8] = x21; + out1[9] = x22; +} + +/* + * The function fiat_25519_add adds two field elements. + * + * Postconditions: + * eval out1 mod m = (eval arg1 + eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + x1 = ((arg1[0]) + (arg2[0])); + x2 = ((arg1[1]) + (arg2[1])); + x3 = ((arg1[2]) + (arg2[2])); + x4 = ((arg1[3]) + (arg2[3])); + x5 = ((arg1[4]) + (arg2[4])); + x6 = ((arg1[5]) + (arg2[5])); + x7 = ((arg1[6]) + (arg2[6])); + x8 = ((arg1[7]) + (arg2[7])); + x9 = ((arg1[8]) + (arg2[8])); + x10 = ((arg1[9]) + (arg2[9])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; + out1[5] = x6; + out1[6] = x7; + out1[7] = x8; + out1[8] = x9; + out1[9] = x10; +} + +/* + * The function fiat_25519_sub subtracts two field elements. 
+ * + * Postconditions: + * eval out1 mod m = (eval arg1 - eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + x1 = ((UINT32_C(0x7ffffda) + (arg1[0])) - (arg2[0])); + x2 = ((UINT32_C(0x3fffffe) + (arg1[1])) - (arg2[1])); + x3 = ((UINT32_C(0x7fffffe) + (arg1[2])) - (arg2[2])); + x4 = ((UINT32_C(0x3fffffe) + (arg1[3])) - (arg2[3])); + x5 = ((UINT32_C(0x7fffffe) + (arg1[4])) - (arg2[4])); + x6 = ((UINT32_C(0x3fffffe) + (arg1[5])) - (arg2[5])); + x7 = ((UINT32_C(0x7fffffe) + (arg1[6])) - (arg2[6])); + x8 = ((UINT32_C(0x3fffffe) + (arg1[7])) - (arg2[7])); + x9 = ((UINT32_C(0x7fffffe) + (arg1[8])) - (arg2[8])); + x10 = ((UINT32_C(0x3fffffe) + (arg1[9])) - (arg2[9])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; + out1[5] = x6; + out1[6] = x7; + out1[7] = x8; + out1[8] = x9; + out1[9] = x10; +} + +/* + * The function fiat_25519_opp negates a field element. + * + * Postconditions: + * eval out1 mod m = -eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + x1 = (UINT32_C(0x7ffffda) - (arg1[0])); + x2 = (UINT32_C(0x3fffffe) - (arg1[1])); + x3 = (UINT32_C(0x7fffffe) - (arg1[2])); + x4 = (UINT32_C(0x3fffffe) - (arg1[3])); + x5 = (UINT32_C(0x7fffffe) - (arg1[4])); + x6 = (UINT32_C(0x3fffffe) - (arg1[5])); + x7 = (UINT32_C(0x7fffffe) - (arg1[6])); + x8 = (UINT32_C(0x3fffffe) - (arg1[7])); + x9 = (UINT32_C(0x7fffffe) - (arg1[8])); + x10 = (UINT32_C(0x3fffffe) - (arg1[9])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; + out1[5] = x6; + out1[6] = x7; + out1[7] = x8; + out1[8] = x9; + out1[9] = x10; +} + +/* + * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. 
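(A note on the add/sub/opp functions above, separate from the diff: they only combine limbs, so their outputs are loose elements, and callers are expected to run fiat_25519_carry before handing the result to a tight-input function such as to_bytes. The wrapper below is a hypothetical illustration of that pattern; add_and_reduce is an invented name, not part of ring or fiat-crypto.)

/* Illustrative sketch only. */
#include <stdint.h>
#include "curve25519_32.h"

static void add_and_reduce(fiat_25519_tight_field_element out,
                           const fiat_25519_tight_field_element a,
                           const fiat_25519_tight_field_element b) {
  fiat_25519_loose_field_element t;
  fiat_25519_add(t, a, b);   /* limb-wise add; bounds grow to "loose" */
  fiat_25519_carry(out, t);  /* propagate carries; restore tight bounds */
}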
+ * + * Postconditions: + * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] + * + * Output Bounds: + * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { + uint32_t x1; + fiat_25519_uint1 x2; + uint32_t x3; + fiat_25519_uint1 x4; + uint32_t x5; + fiat_25519_uint1 x6; + uint32_t x7; + fiat_25519_uint1 x8; + uint32_t x9; + fiat_25519_uint1 x10; + uint32_t x11; + fiat_25519_uint1 x12; + uint32_t x13; + fiat_25519_uint1 x14; + uint32_t x15; + fiat_25519_uint1 x16; + uint32_t x17; + fiat_25519_uint1 x18; + uint32_t x19; + fiat_25519_uint1 x20; + uint32_t x21; + uint32_t x22; + fiat_25519_uint1 x23; + uint32_t x24; + fiat_25519_uint1 x25; + uint32_t x26; + fiat_25519_uint1 x27; + uint32_t x28; + fiat_25519_uint1 x29; + uint32_t x30; + fiat_25519_uint1 x31; + uint32_t x32; + fiat_25519_uint1 x33; + uint32_t x34; + fiat_25519_uint1 x35; + uint32_t x36; + fiat_25519_uint1 x37; + uint32_t x38; + fiat_25519_uint1 x39; + uint32_t x40; + fiat_25519_uint1 x41; + uint32_t x42; + uint32_t x43; + uint32_t x44; + uint32_t x45; + uint32_t x46; + uint32_t x47; + uint32_t x48; + uint32_t x49; + uint8_t x50; + uint32_t x51; + uint8_t x52; + uint32_t x53; + uint8_t x54; + uint8_t x55; + uint32_t x56; + uint8_t x57; + uint32_t x58; + uint8_t x59; + uint32_t x60; + uint8_t x61; + uint8_t x62; + uint32_t x63; + uint8_t x64; + uint32_t x65; + uint8_t x66; + uint32_t x67; + uint8_t x68; + uint8_t x69; + uint32_t x70; + uint8_t x71; + uint32_t x72; + uint8_t x73; + uint32_t x74; + uint8_t x75; + uint8_t x76; + uint32_t x77; + uint8_t x78; + uint32_t x79; + uint8_t x80; + uint32_t x81; + uint8_t x82; + uint8_t x83; + uint8_t x84; + uint32_t x85; + uint8_t x86; + uint32_t x87; + uint8_t x88; + fiat_25519_uint1 x89; + uint32_t x90; + uint8_t x91; + uint32_t x92; + uint8_t x93; + uint32_t x94; + uint8_t x95; + uint8_t x96; + uint32_t x97; + uint8_t x98; + uint32_t x99; + uint8_t x100; + uint32_t x101; + uint8_t x102; + uint8_t x103; + uint32_t x104; + uint8_t x105; + uint32_t x106; + uint8_t x107; + uint32_t x108; + uint8_t x109; + uint8_t x110; + uint32_t x111; + uint8_t x112; + uint32_t x113; + uint8_t x114; + uint32_t x115; + uint8_t x116; + uint8_t x117; + fiat_25519_subborrowx_u26(&x1, &x2, 0x0, (arg1[0]), UINT32_C(0x3ffffed)); + fiat_25519_subborrowx_u25(&x3, &x4, x2, (arg1[1]), UINT32_C(0x1ffffff)); + fiat_25519_subborrowx_u26(&x5, &x6, x4, (arg1[2]), UINT32_C(0x3ffffff)); + fiat_25519_subborrowx_u25(&x7, &x8, x6, (arg1[3]), UINT32_C(0x1ffffff)); + fiat_25519_subborrowx_u26(&x9, &x10, x8, (arg1[4]), UINT32_C(0x3ffffff)); + fiat_25519_subborrowx_u25(&x11, &x12, x10, (arg1[5]), UINT32_C(0x1ffffff)); + fiat_25519_subborrowx_u26(&x13, &x14, x12, (arg1[6]), UINT32_C(0x3ffffff)); + fiat_25519_subborrowx_u25(&x15, &x16, x14, (arg1[7]), UINT32_C(0x1ffffff)); + fiat_25519_subborrowx_u26(&x17, &x18, x16, (arg1[8]), UINT32_C(0x3ffffff)); + fiat_25519_subborrowx_u25(&x19, &x20, x18, (arg1[9]), UINT32_C(0x1ffffff)); + fiat_25519_cmovznz_u32(&x21, 
x20, 0x0, UINT32_C(0xffffffff)); + fiat_25519_addcarryx_u26(&x22, &x23, 0x0, x1, (x21 & UINT32_C(0x3ffffed))); + fiat_25519_addcarryx_u25(&x24, &x25, x23, x3, (x21 & UINT32_C(0x1ffffff))); + fiat_25519_addcarryx_u26(&x26, &x27, x25, x5, (x21 & UINT32_C(0x3ffffff))); + fiat_25519_addcarryx_u25(&x28, &x29, x27, x7, (x21 & UINT32_C(0x1ffffff))); + fiat_25519_addcarryx_u26(&x30, &x31, x29, x9, (x21 & UINT32_C(0x3ffffff))); + fiat_25519_addcarryx_u25(&x32, &x33, x31, x11, (x21 & UINT32_C(0x1ffffff))); + fiat_25519_addcarryx_u26(&x34, &x35, x33, x13, (x21 & UINT32_C(0x3ffffff))); + fiat_25519_addcarryx_u25(&x36, &x37, x35, x15, (x21 & UINT32_C(0x1ffffff))); + fiat_25519_addcarryx_u26(&x38, &x39, x37, x17, (x21 & UINT32_C(0x3ffffff))); + fiat_25519_addcarryx_u25(&x40, &x41, x39, x19, (x21 & UINT32_C(0x1ffffff))); + x42 = (x40 << 6); + x43 = (x38 << 4); + x44 = (x36 << 3); + x45 = (x34 * (uint32_t)0x2); + x46 = (x30 << 6); + x47 = (x28 << 5); + x48 = (x26 << 3); + x49 = (x24 << 2); + x50 = (uint8_t)(x22 & UINT8_C(0xff)); + x51 = (x22 >> 8); + x52 = (uint8_t)(x51 & UINT8_C(0xff)); + x53 = (x51 >> 8); + x54 = (uint8_t)(x53 & UINT8_C(0xff)); + x55 = (uint8_t)(x53 >> 8); + x56 = (x49 + (uint32_t)x55); + x57 = (uint8_t)(x56 & UINT8_C(0xff)); + x58 = (x56 >> 8); + x59 = (uint8_t)(x58 & UINT8_C(0xff)); + x60 = (x58 >> 8); + x61 = (uint8_t)(x60 & UINT8_C(0xff)); + x62 = (uint8_t)(x60 >> 8); + x63 = (x48 + (uint32_t)x62); + x64 = (uint8_t)(x63 & UINT8_C(0xff)); + x65 = (x63 >> 8); + x66 = (uint8_t)(x65 & UINT8_C(0xff)); + x67 = (x65 >> 8); + x68 = (uint8_t)(x67 & UINT8_C(0xff)); + x69 = (uint8_t)(x67 >> 8); + x70 = (x47 + (uint32_t)x69); + x71 = (uint8_t)(x70 & UINT8_C(0xff)); + x72 = (x70 >> 8); + x73 = (uint8_t)(x72 & UINT8_C(0xff)); + x74 = (x72 >> 8); + x75 = (uint8_t)(x74 & UINT8_C(0xff)); + x76 = (uint8_t)(x74 >> 8); + x77 = (x46 + (uint32_t)x76); + x78 = (uint8_t)(x77 & UINT8_C(0xff)); + x79 = (x77 >> 8); + x80 = (uint8_t)(x79 & UINT8_C(0xff)); + x81 = (x79 >> 8); + x82 = (uint8_t)(x81 & UINT8_C(0xff)); + x83 = (uint8_t)(x81 >> 8); + x84 = (uint8_t)(x32 & UINT8_C(0xff)); + x85 = (x32 >> 8); + x86 = (uint8_t)(x85 & UINT8_C(0xff)); + x87 = (x85 >> 8); + x88 = (uint8_t)(x87 & UINT8_C(0xff)); + x89 = (fiat_25519_uint1)(x87 >> 8); + x90 = (x45 + (uint32_t)x89); + x91 = (uint8_t)(x90 & UINT8_C(0xff)); + x92 = (x90 >> 8); + x93 = (uint8_t)(x92 & UINT8_C(0xff)); + x94 = (x92 >> 8); + x95 = (uint8_t)(x94 & UINT8_C(0xff)); + x96 = (uint8_t)(x94 >> 8); + x97 = (x44 + (uint32_t)x96); + x98 = (uint8_t)(x97 & UINT8_C(0xff)); + x99 = (x97 >> 8); + x100 = (uint8_t)(x99 & UINT8_C(0xff)); + x101 = (x99 >> 8); + x102 = (uint8_t)(x101 & UINT8_C(0xff)); + x103 = (uint8_t)(x101 >> 8); + x104 = (x43 + (uint32_t)x103); + x105 = (uint8_t)(x104 & UINT8_C(0xff)); + x106 = (x104 >> 8); + x107 = (uint8_t)(x106 & UINT8_C(0xff)); + x108 = (x106 >> 8); + x109 = (uint8_t)(x108 & UINT8_C(0xff)); + x110 = (uint8_t)(x108 >> 8); + x111 = (x42 + (uint32_t)x110); + x112 = (uint8_t)(x111 & UINT8_C(0xff)); + x113 = (x111 >> 8); + x114 = (uint8_t)(x113 & UINT8_C(0xff)); + x115 = (x113 >> 8); + x116 = (uint8_t)(x115 & UINT8_C(0xff)); + x117 = (uint8_t)(x115 >> 8); + out1[0] = x50; + out1[1] = x52; + out1[2] = x54; + out1[3] = x57; + out1[4] = x59; + out1[5] = x61; + out1[6] = x64; + out1[7] = x66; + out1[8] = x68; + out1[9] = x71; + out1[10] = x73; + out1[11] = x75; + out1[12] = x78; + out1[13] = x80; + out1[14] = x82; + out1[15] = x83; + out1[16] = x84; + out1[17] = x86; + out1[18] = x88; + out1[19] = x91; + out1[20] = x93; + out1[21] = 
x95; + out1[22] = x98; + out1[23] = x100; + out1[24] = x102; + out1[25] = x105; + out1[26] = x107; + out1[27] = x109; + out1[28] = x112; + out1[29] = x114; + out1[30] = x116; + out1[31] = x117; +} + +/* + * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. + * + * Postconditions: + * eval out1 mod m = bytes_eval arg1 mod m + * + * Input Bounds: + * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint32_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint8_t x16; + uint32_t x17; + uint32_t x18; + uint32_t x19; + uint32_t x20; + uint32_t x21; + uint32_t x22; + uint32_t x23; + uint32_t x24; + uint32_t x25; + uint32_t x26; + uint32_t x27; + uint32_t x28; + uint32_t x29; + uint32_t x30; + uint32_t x31; + uint8_t x32; + uint32_t x33; + uint32_t x34; + uint32_t x35; + uint32_t x36; + uint8_t x37; + uint32_t x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + uint8_t x42; + uint32_t x43; + uint32_t x44; + uint32_t x45; + uint32_t x46; + uint8_t x47; + uint32_t x48; + uint32_t x49; + uint32_t x50; + uint32_t x51; + uint8_t x52; + uint32_t x53; + uint32_t x54; + uint32_t x55; + uint32_t x56; + uint32_t x57; + uint32_t x58; + uint32_t x59; + uint8_t x60; + uint32_t x61; + uint32_t x62; + uint32_t x63; + uint32_t x64; + uint8_t x65; + uint32_t x66; + uint32_t x67; + uint32_t x68; + uint32_t x69; + uint8_t x70; + uint32_t x71; + uint32_t x72; + uint32_t x73; + uint32_t x74; + uint8_t x75; + uint32_t x76; + uint32_t x77; + uint32_t x78; + x1 = ((uint32_t)(arg1[31]) << 18); + x2 = ((uint32_t)(arg1[30]) << 10); + x3 = ((uint32_t)(arg1[29]) << 2); + x4 = ((uint32_t)(arg1[28]) << 20); + x5 = ((uint32_t)(arg1[27]) << 12); + x6 = ((uint32_t)(arg1[26]) << 4); + x7 = ((uint32_t)(arg1[25]) << 21); + x8 = ((uint32_t)(arg1[24]) << 13); + x9 = ((uint32_t)(arg1[23]) << 5); + x10 = ((uint32_t)(arg1[22]) << 23); + x11 = ((uint32_t)(arg1[21]) << 15); + x12 = ((uint32_t)(arg1[20]) << 7); + x13 = ((uint32_t)(arg1[19]) << 24); + x14 = ((uint32_t)(arg1[18]) << 16); + x15 = ((uint32_t)(arg1[17]) << 8); + x16 = (arg1[16]); + x17 = ((uint32_t)(arg1[15]) << 18); + x18 = ((uint32_t)(arg1[14]) << 10); + x19 = ((uint32_t)(arg1[13]) << 2); + x20 = ((uint32_t)(arg1[12]) << 19); + x21 = ((uint32_t)(arg1[11]) << 11); + x22 = ((uint32_t)(arg1[10]) << 3); + x23 = ((uint32_t)(arg1[9]) << 21); + x24 = ((uint32_t)(arg1[8]) << 13); + x25 = ((uint32_t)(arg1[7]) << 5); + x26 = ((uint32_t)(arg1[6]) << 22); + x27 = ((uint32_t)(arg1[5]) << 14); + x28 = ((uint32_t)(arg1[4]) << 6); + x29 = ((uint32_t)(arg1[3]) << 24); + x30 = ((uint32_t)(arg1[2]) << 16); + x31 = ((uint32_t)(arg1[1]) << 8); + x32 = (arg1[0]); + x33 = (x31 + (uint32_t)x32); + x34 = (x30 + x33); + x35 = (x29 + x34); + x36 = (x35 & UINT32_C(0x3ffffff)); + x37 = (uint8_t)(x35 >> 26); + x38 = (x28 + 
(uint32_t)x37); + x39 = (x27 + x38); + x40 = (x26 + x39); + x41 = (x40 & UINT32_C(0x1ffffff)); + x42 = (uint8_t)(x40 >> 25); + x43 = (x25 + (uint32_t)x42); + x44 = (x24 + x43); + x45 = (x23 + x44); + x46 = (x45 & UINT32_C(0x3ffffff)); + x47 = (uint8_t)(x45 >> 26); + x48 = (x22 + (uint32_t)x47); + x49 = (x21 + x48); + x50 = (x20 + x49); + x51 = (x50 & UINT32_C(0x1ffffff)); + x52 = (uint8_t)(x50 >> 25); + x53 = (x19 + (uint32_t)x52); + x54 = (x18 + x53); + x55 = (x17 + x54); + x56 = (x15 + (uint32_t)x16); + x57 = (x14 + x56); + x58 = (x13 + x57); + x59 = (x58 & UINT32_C(0x1ffffff)); + x60 = (uint8_t)(x58 >> 25); + x61 = (x12 + (uint32_t)x60); + x62 = (x11 + x61); + x63 = (x10 + x62); + x64 = (x63 & UINT32_C(0x3ffffff)); + x65 = (uint8_t)(x63 >> 26); + x66 = (x9 + (uint32_t)x65); + x67 = (x8 + x66); + x68 = (x7 + x67); + x69 = (x68 & UINT32_C(0x1ffffff)); + x70 = (uint8_t)(x68 >> 25); + x71 = (x6 + (uint32_t)x70); + x72 = (x5 + x71); + x73 = (x4 + x72); + x74 = (x73 & UINT32_C(0x3ffffff)); + x75 = (uint8_t)(x73 >> 26); + x76 = (x3 + (uint32_t)x75); + x77 = (x2 + x76); + x78 = (x1 + x77); + out1[0] = x36; + out1[1] = x41; + out1[2] = x46; + out1[3] = x51; + out1[4] = x55; + out1[5] = x59; + out1[6] = x64; + out1[7] = x69; + out1[8] = x74; + out1[9] = x78; +} + +/* + * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. + * + * Postconditions: + * eval out1 mod m = (121666 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint32_t x11; + uint32_t x12; + uint64_t x13; + uint32_t x14; + uint32_t x15; + uint64_t x16; + uint32_t x17; + uint32_t x18; + uint64_t x19; + uint32_t x20; + uint32_t x21; + uint64_t x22; + uint32_t x23; + uint32_t x24; + uint64_t x25; + uint32_t x26; + uint32_t x27; + uint64_t x28; + uint32_t x29; + uint32_t x30; + uint64_t x31; + uint32_t x32; + uint32_t x33; + uint64_t x34; + uint32_t x35; + uint32_t x36; + uint64_t x37; + uint32_t x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + fiat_25519_uint1 x42; + uint32_t x43; + uint32_t x44; + fiat_25519_uint1 x45; + uint32_t x46; + uint32_t x47; + x1 = ((uint64_t)UINT32_C(0x1db42) * (arg1[9])); + x2 = ((uint64_t)UINT32_C(0x1db42) * (arg1[8])); + x3 = ((uint64_t)UINT32_C(0x1db42) * (arg1[7])); + x4 = ((uint64_t)UINT32_C(0x1db42) * (arg1[6])); + x5 = ((uint64_t)UINT32_C(0x1db42) * (arg1[5])); + x6 = ((uint64_t)UINT32_C(0x1db42) * (arg1[4])); + x7 = ((uint64_t)UINT32_C(0x1db42) * (arg1[3])); + x8 = ((uint64_t)UINT32_C(0x1db42) * (arg1[2])); + x9 = ((uint64_t)UINT32_C(0x1db42) * (arg1[1])); + x10 = ((uint64_t)UINT32_C(0x1db42) * (arg1[0])); + x11 = (uint32_t)(x10 >> 26); + x12 = (uint32_t)(x10 & UINT32_C(0x3ffffff)); + x13 = (x11 + x9); + x14 = (uint32_t)(x13 >> 25); + x15 = (uint32_t)(x13 & UINT32_C(0x1ffffff)); + x16 = (x14 + x8); + x17 = (uint32_t)(x16 >> 26); + x18 = (uint32_t)(x16 & UINT32_C(0x3ffffff)); + x19 = (x17 + x7); + x20 = (uint32_t)(x19 >> 25); + x21 = (uint32_t)(x19 & UINT32_C(0x1ffffff)); + x22 = (x20 + x6); + x23 = (uint32_t)(x22 >> 26); + x24 = (uint32_t)(x22 & UINT32_C(0x3ffffff)); + x25 = (x23 + x5); + x26 = (uint32_t)(x25 >> 25); + x27 = (uint32_t)(x25 & UINT32_C(0x1ffffff)); + x28 = (x26 + x4); + x29 = (uint32_t)(x28 >> 26); + x30 = (uint32_t)(x28 & UINT32_C(0x3ffffff)); + x31 = 
(x29 + x3); + x32 = (uint32_t)(x31 >> 25); + x33 = (uint32_t)(x31 & UINT32_C(0x1ffffff)); + x34 = (x32 + x2); + x35 = (uint32_t)(x34 >> 26); + x36 = (uint32_t)(x34 & UINT32_C(0x3ffffff)); + x37 = (x35 + x1); + x38 = (uint32_t)(x37 >> 25); + x39 = (uint32_t)(x37 & UINT32_C(0x1ffffff)); + x40 = (x38 * UINT8_C(0x13)); + x41 = (x12 + x40); + x42 = (fiat_25519_uint1)(x41 >> 26); + x43 = (x41 & UINT32_C(0x3ffffff)); + x44 = (x42 + x15); + x45 = (fiat_25519_uint1)(x44 >> 25); + x46 = (x44 & UINT32_C(0x1ffffff)); + x47 = (x45 + x18); + out1[0] = x43; + out1[1] = x46; + out1[2] = x47; + out1[3] = x21; + out1[4] = x24; + out1[5] = x27; + out1[6] = x30; + out1[7] = x33; + out1[8] = x36; + out1[9] = x39; +} diff --git a/ring-0.17.14/third_party/fiat/curve25519_64.h b/ring-0.17.14/third_party/fiat/curve25519_64.h new file mode 100644 index 0000000000..ea2c23ca27 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/curve25519_64.h @@ -0,0 +1,916 @@ +/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier 25519 64 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ +/* curve description: 25519 */ +/* machine_wordsize = 64 (from "64") */ +/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ +/* n = 5 (from "(auto)") */ +/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ +/* tight_bounds_multiplier = 1 (from "") */ +/* */ +/* Computed values: */ +/* carry_chain = [0, 1, 2, 3, 4, 0, 1] */ +/* eval z = z[0] + (z[1] << 51) + (z[2] << 102) + (z[3] << 153) + (z[4] << 204) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* balance = [0xfffffffffffda, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe] */ + +#include +typedef unsigned char fiat_25519_uint1; +typedef signed char fiat_25519_int1; +#if defined(__GNUC__) || defined(__clang__) +# define FIAT_25519_FIAT_EXTENSION __extension__ +# define FIAT_25519_FIAT_INLINE __inline__ +#else +# define FIAT_25519_FIAT_EXTENSION +# define FIAT_25519_FIAT_INLINE +#endif + +FIAT_25519_FIAT_EXTENSION typedef signed __int128 fiat_25519_int128; +FIAT_25519_FIAT_EXTENSION typedef unsigned __int128 fiat_25519_uint128; + +/* The type fiat_25519_loose_field_element is a field element with loose bounds. */ +/* Bounds: [[0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000]] */ +typedef uint64_t fiat_25519_loose_field_element[5]; + +/* The type fiat_25519_tight_field_element is a field element with tight bounds. 
*/ +/* Bounds: [[0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000]] */ +typedef uint64_t fiat_25519_tight_field_element[5]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#if !defined(FIAT_25519_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +static __inline__ uint64_t fiat_25519_value_barrier_u64(uint64_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} +#else +# define fiat_25519_value_barrier_u64(x) (x) +#endif + + +/* + * The function fiat_25519_addcarryx_u51 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^51 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^51⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x7ffffffffffff] + * arg3: [0x0 ~> 0x7ffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0x7ffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + uint64_t x1; + uint64_t x2; + fiat_25519_uint1 x3; + x1 = ((arg1 + arg2) + arg3); + x2 = (x1 & UINT64_C(0x7ffffffffffff)); + x3 = (fiat_25519_uint1)(x1 >> 51); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_25519_subborrowx_u51 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^51 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^51⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x7ffffffffffff] + * arg3: [0x0 ~> 0x7ffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0x7ffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + int64_t x1; + fiat_25519_int1 x2; + uint64_t x3; + x1 = ((int64_t)(arg2 - (int64_t)arg1) - (int64_t)arg3); + x2 = (fiat_25519_int1)(x1 >> 51); + x3 = (x1 & UINT64_C(0x7ffffffffffff)); + *out1 = x3; + *out2 = (fiat_25519_uint1)(0x0 - x2); +} + +/* + * The function fiat_25519_cmovznz_u64 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_25519_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_25519_value_barrier_u64(x2) & arg3) | (fiat_25519_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. 
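(Editorial aside, not part of the diff: fiat_25519_cmovznz_u64 above is the building block for branch-free selection; the inline-asm value barrier keeps the compiler from rewriting the masked select back into a data-dependent branch. A hedged sketch of selecting one of two 5-limb vectors in constant time follows; fe_select is an invented helper name and the include path is an assumption.)

/* Illustrative only. */
#include <stdint.h>
#include "curve25519_64.h"

static void fe_select(uint64_t out[5], fiat_25519_uint1 cond,
                      const uint64_t if_zero[5], const uint64_t if_nonzero[5]) {
  for (int i = 0; i < 5; i++) {
    /* out[i] = cond ? if_nonzero[i] : if_zero[i], with no secret-dependent branch */
    fiat_25519_cmovznz_u64(&out[i], cond, if_zero[i], if_nonzero[i]);
  }
}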
+ * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { + fiat_25519_uint128 x1; + fiat_25519_uint128 x2; + fiat_25519_uint128 x3; + fiat_25519_uint128 x4; + fiat_25519_uint128 x5; + fiat_25519_uint128 x6; + fiat_25519_uint128 x7; + fiat_25519_uint128 x8; + fiat_25519_uint128 x9; + fiat_25519_uint128 x10; + fiat_25519_uint128 x11; + fiat_25519_uint128 x12; + fiat_25519_uint128 x13; + fiat_25519_uint128 x14; + fiat_25519_uint128 x15; + fiat_25519_uint128 x16; + fiat_25519_uint128 x17; + fiat_25519_uint128 x18; + fiat_25519_uint128 x19; + fiat_25519_uint128 x20; + fiat_25519_uint128 x21; + fiat_25519_uint128 x22; + fiat_25519_uint128 x23; + fiat_25519_uint128 x24; + fiat_25519_uint128 x25; + fiat_25519_uint128 x26; + uint64_t x27; + uint64_t x28; + fiat_25519_uint128 x29; + fiat_25519_uint128 x30; + fiat_25519_uint128 x31; + fiat_25519_uint128 x32; + fiat_25519_uint128 x33; + uint64_t x34; + uint64_t x35; + fiat_25519_uint128 x36; + uint64_t x37; + uint64_t x38; + fiat_25519_uint128 x39; + uint64_t x40; + uint64_t x41; + fiat_25519_uint128 x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + fiat_25519_uint1 x50; + uint64_t x51; + uint64_t x52; + x1 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[4]) * UINT8_C(0x13))); + x2 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[3]) * UINT8_C(0x13))); + x3 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[2]) * UINT8_C(0x13))); + x4 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[1]) * UINT8_C(0x13))); + x5 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[4]) * UINT8_C(0x13))); + x6 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[3]) * UINT8_C(0x13))); + x7 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[2]) * UINT8_C(0x13))); + x8 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[4]) * UINT8_C(0x13))); + x9 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[3]) * UINT8_C(0x13))); + x10 = ((fiat_25519_uint128)(arg1[1]) * ((arg2[4]) * UINT8_C(0x13))); + x11 = ((fiat_25519_uint128)(arg1[4]) * (arg2[0])); + x12 = ((fiat_25519_uint128)(arg1[3]) * (arg2[1])); + x13 = ((fiat_25519_uint128)(arg1[3]) * (arg2[0])); + x14 = ((fiat_25519_uint128)(arg1[2]) * (arg2[2])); + x15 = ((fiat_25519_uint128)(arg1[2]) * (arg2[1])); + x16 = ((fiat_25519_uint128)(arg1[2]) * (arg2[0])); + x17 = ((fiat_25519_uint128)(arg1[1]) * (arg2[3])); + x18 = ((fiat_25519_uint128)(arg1[1]) * (arg2[2])); + x19 = ((fiat_25519_uint128)(arg1[1]) * (arg2[1])); + x20 = ((fiat_25519_uint128)(arg1[1]) * (arg2[0])); + x21 = ((fiat_25519_uint128)(arg1[0]) * (arg2[4])); + x22 = ((fiat_25519_uint128)(arg1[0]) * (arg2[3])); + x23 = ((fiat_25519_uint128)(arg1[0]) * (arg2[2])); + x24 = ((fiat_25519_uint128)(arg1[0]) * (arg2[1])); + x25 = ((fiat_25519_uint128)(arg1[0]) * (arg2[0])); + x26 = (x25 + (x10 + (x9 + (x7 + x4)))); + x27 = (uint64_t)(x26 >> 51); + x28 = (uint64_t)(x26 & UINT64_C(0x7ffffffffffff)); + x29 = (x21 + (x17 + (x14 + (x12 + x11)))); + x30 = (x22 + (x18 + (x15 + (x13 + x1)))); + x31 = (x23 + (x19 + (x16 + (x5 + x2)))); + x32 = (x24 + (x20 + (x8 + (x6 + x3)))); + x33 = (x27 + x32); + x34 = (uint64_t)(x33 >> 51); + x35 = (uint64_t)(x33 & UINT64_C(0x7ffffffffffff)); + x36 = (x34 + x31); + x37 = (uint64_t)(x36 >> 51); + x38 = (uint64_t)(x36 & UINT64_C(0x7ffffffffffff)); + x39 = (x37 + x30); + x40 = (uint64_t)(x39 >> 51); + x41 = (uint64_t)(x39 & UINT64_C(0x7ffffffffffff)); + x42 
= (x40 + x29); + x43 = (uint64_t)(x42 >> 51); + x44 = (uint64_t)(x42 & UINT64_C(0x7ffffffffffff)); + x45 = (x43 * UINT8_C(0x13)); + x46 = (x28 + x45); + x47 = (x46 >> 51); + x48 = (x46 & UINT64_C(0x7ffffffffffff)); + x49 = (x47 + x35); + x50 = (fiat_25519_uint1)(x49 >> 51); + x51 = (x49 & UINT64_C(0x7ffffffffffff)); + x52 = (x50 + x38); + out1[0] = x48; + out1[1] = x51; + out1[2] = x52; + out1[3] = x41; + out1[4] = x44; +} + +/* + * The function fiat_25519_carry_square squares a field element and reduces the result. + * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + fiat_25519_uint128 x9; + fiat_25519_uint128 x10; + fiat_25519_uint128 x11; + fiat_25519_uint128 x12; + fiat_25519_uint128 x13; + fiat_25519_uint128 x14; + fiat_25519_uint128 x15; + fiat_25519_uint128 x16; + fiat_25519_uint128 x17; + fiat_25519_uint128 x18; + fiat_25519_uint128 x19; + fiat_25519_uint128 x20; + fiat_25519_uint128 x21; + fiat_25519_uint128 x22; + fiat_25519_uint128 x23; + fiat_25519_uint128 x24; + uint64_t x25; + uint64_t x26; + fiat_25519_uint128 x27; + fiat_25519_uint128 x28; + fiat_25519_uint128 x29; + fiat_25519_uint128 x30; + fiat_25519_uint128 x31; + uint64_t x32; + uint64_t x33; + fiat_25519_uint128 x34; + uint64_t x35; + uint64_t x36; + fiat_25519_uint128 x37; + uint64_t x38; + uint64_t x39; + fiat_25519_uint128 x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_25519_uint1 x48; + uint64_t x49; + uint64_t x50; + x1 = ((arg1[4]) * UINT8_C(0x13)); + x2 = (x1 * 0x2); + x3 = ((arg1[4]) * 0x2); + x4 = ((arg1[3]) * UINT8_C(0x13)); + x5 = (x4 * 0x2); + x6 = ((arg1[3]) * 0x2); + x7 = ((arg1[2]) * 0x2); + x8 = ((arg1[1]) * 0x2); + x9 = ((fiat_25519_uint128)(arg1[4]) * x1); + x10 = ((fiat_25519_uint128)(arg1[3]) * x2); + x11 = ((fiat_25519_uint128)(arg1[3]) * x4); + x12 = ((fiat_25519_uint128)(arg1[2]) * x2); + x13 = ((fiat_25519_uint128)(arg1[2]) * x5); + x14 = ((fiat_25519_uint128)(arg1[2]) * (arg1[2])); + x15 = ((fiat_25519_uint128)(arg1[1]) * x2); + x16 = ((fiat_25519_uint128)(arg1[1]) * x6); + x17 = ((fiat_25519_uint128)(arg1[1]) * x7); + x18 = ((fiat_25519_uint128)(arg1[1]) * (arg1[1])); + x19 = ((fiat_25519_uint128)(arg1[0]) * x3); + x20 = ((fiat_25519_uint128)(arg1[0]) * x6); + x21 = ((fiat_25519_uint128)(arg1[0]) * x7); + x22 = ((fiat_25519_uint128)(arg1[0]) * x8); + x23 = ((fiat_25519_uint128)(arg1[0]) * (arg1[0])); + x24 = (x23 + (x15 + x13)); + x25 = (uint64_t)(x24 >> 51); + x26 = (uint64_t)(x24 & UINT64_C(0x7ffffffffffff)); + x27 = (x19 + (x16 + x14)); + x28 = (x20 + (x17 + x9)); + x29 = (x21 + (x18 + x10)); + x30 = (x22 + (x12 + x11)); + x31 = (x25 + x30); + x32 = (uint64_t)(x31 >> 51); + x33 = (uint64_t)(x31 & UINT64_C(0x7ffffffffffff)); + x34 = (x32 + x29); + x35 = (uint64_t)(x34 >> 51); + x36 = (uint64_t)(x34 & UINT64_C(0x7ffffffffffff)); + x37 = (x35 + x28); + x38 = (uint64_t)(x37 >> 51); + x39 = (uint64_t)(x37 & UINT64_C(0x7ffffffffffff)); + x40 = (x38 + x27); + x41 = (uint64_t)(x40 >> 51); + x42 = (uint64_t)(x40 & UINT64_C(0x7ffffffffffff)); + x43 = (x41 * UINT8_C(0x13)); + x44 = (x26 + x43); + x45 = (x44 >> 51); + x46 = (x44 & UINT64_C(0x7ffffffffffff)); + x47 = (x45 + x33); + x48 = (fiat_25519_uint1)(x47 >> 51); + x49 = (x47 & 
UINT64_C(0x7ffffffffffff)); + x50 = (x48 + x36); + out1[0] = x46; + out1[1] = x49; + out1[2] = x50; + out1[3] = x39; + out1[4] = x42; +} + +/* + * The function fiat_25519_carry reduces a field element. + * + * Postconditions: + * eval out1 mod m = eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + x1 = (arg1[0]); + x2 = ((x1 >> 51) + (arg1[1])); + x3 = ((x2 >> 51) + (arg1[2])); + x4 = ((x3 >> 51) + (arg1[3])); + x5 = ((x4 >> 51) + (arg1[4])); + x6 = ((x1 & UINT64_C(0x7ffffffffffff)) + ((x5 >> 51) * UINT8_C(0x13))); + x7 = ((fiat_25519_uint1)(x6 >> 51) + (x2 & UINT64_C(0x7ffffffffffff))); + x8 = (x6 & UINT64_C(0x7ffffffffffff)); + x9 = (x7 & UINT64_C(0x7ffffffffffff)); + x10 = ((fiat_25519_uint1)(x7 >> 51) + (x3 & UINT64_C(0x7ffffffffffff))); + x11 = (x4 & UINT64_C(0x7ffffffffffff)); + x12 = (x5 & UINT64_C(0x7ffffffffffff)); + out1[0] = x8; + out1[1] = x9; + out1[2] = x10; + out1[3] = x11; + out1[4] = x12; +} + +/* + * The function fiat_25519_add adds two field elements. + * + * Postconditions: + * eval out1 mod m = (eval arg1 + eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = ((arg1[0]) + (arg2[0])); + x2 = ((arg1[1]) + (arg2[1])); + x3 = ((arg1[2]) + (arg2[2])); + x4 = ((arg1[3]) + (arg2[3])); + x5 = ((arg1[4]) + (arg2[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_sub subtracts two field elements. + * + * Postconditions: + * eval out1 mod m = (eval arg1 - eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = ((UINT64_C(0xfffffffffffda) + (arg1[0])) - (arg2[0])); + x2 = ((UINT64_C(0xffffffffffffe) + (arg1[1])) - (arg2[1])); + x3 = ((UINT64_C(0xffffffffffffe) + (arg1[2])) - (arg2[2])); + x4 = ((UINT64_C(0xffffffffffffe) + (arg1[3])) - (arg2[3])); + x5 = ((UINT64_C(0xffffffffffffe) + (arg1[4])) - (arg2[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_opp negates a field element. + * + * Postconditions: + * eval out1 mod m = -eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = (UINT64_C(0xfffffffffffda) - (arg1[0])); + x2 = (UINT64_C(0xffffffffffffe) - (arg1[1])); + x3 = (UINT64_C(0xffffffffffffe) - (arg1[2])); + x4 = (UINT64_C(0xffffffffffffe) - (arg1[3])); + x5 = (UINT64_C(0xffffffffffffe) - (arg1[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. 
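(Regarding the 64-bit sub/opp above, and again outside the diff proper: the constants they add are the "balance" values from the file header, a limb representation of 2*p, so no limb can underflow; a subsequent carry pass renormalizes the result. The wrapper below is an invented illustration, not part of the vendored code.)

/* Illustrative sketch only. */
#include <stdint.h>
#include "curve25519_64.h"

static void fe_sub_reduced(fiat_25519_tight_field_element out,
                           const fiat_25519_tight_field_element a,
                           const fiat_25519_tight_field_element b) {
  fiat_25519_loose_field_element t;
  fiat_25519_sub(t, a, b);   /* computes (2*p + a) - b limb-wise; never underflows */
  fiat_25519_carry(out, t);  /* reduce back to tight bounds */
}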
+ * + * Postconditions: + * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] + * + * Output Bounds: + * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { + uint64_t x1; + fiat_25519_uint1 x2; + uint64_t x3; + fiat_25519_uint1 x4; + uint64_t x5; + fiat_25519_uint1 x6; + uint64_t x7; + fiat_25519_uint1 x8; + uint64_t x9; + fiat_25519_uint1 x10; + uint64_t x11; + uint64_t x12; + fiat_25519_uint1 x13; + uint64_t x14; + fiat_25519_uint1 x15; + uint64_t x16; + fiat_25519_uint1 x17; + uint64_t x18; + fiat_25519_uint1 x19; + uint64_t x20; + fiat_25519_uint1 x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint8_t x26; + uint64_t x27; + uint8_t x28; + uint64_t x29; + uint8_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint8_t x34; + uint64_t x35; + uint8_t x36; + uint8_t x37; + uint64_t x38; + uint8_t x39; + uint64_t x40; + uint8_t x41; + uint64_t x42; + uint8_t x43; + uint64_t x44; + uint8_t x45; + uint64_t x46; + uint8_t x47; + uint64_t x48; + uint8_t x49; + uint8_t x50; + uint64_t x51; + uint8_t x52; + uint64_t x53; + uint8_t x54; + uint64_t x55; + uint8_t x56; + uint64_t x57; + uint8_t x58; + uint64_t x59; + uint8_t x60; + uint64_t x61; + uint8_t x62; + uint64_t x63; + uint8_t x64; + fiat_25519_uint1 x65; + uint64_t x66; + uint8_t x67; + uint64_t x68; + uint8_t x69; + uint64_t x70; + uint8_t x71; + uint64_t x72; + uint8_t x73; + uint64_t x74; + uint8_t x75; + uint64_t x76; + uint8_t x77; + uint8_t x78; + uint64_t x79; + uint8_t x80; + uint64_t x81; + uint8_t x82; + uint64_t x83; + uint8_t x84; + uint64_t x85; + uint8_t x86; + uint64_t x87; + uint8_t x88; + uint64_t x89; + uint8_t x90; + uint8_t x91; + fiat_25519_subborrowx_u51(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0x7ffffffffffed)); + fiat_25519_subborrowx_u51(&x3, &x4, x2, (arg1[1]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x5, &x6, x4, (arg1[2]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x9, &x10, x8, (arg1[4]), UINT64_C(0x7ffffffffffff)); + fiat_25519_cmovznz_u64(&x11, x10, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_25519_addcarryx_u51(&x12, &x13, 0x0, x1, (x11 & UINT64_C(0x7ffffffffffed))); + fiat_25519_addcarryx_u51(&x14, &x15, x13, x3, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x16, &x17, x15, x5, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x18, &x19, x17, x7, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x20, &x21, x19, x9, (x11 & UINT64_C(0x7ffffffffffff))); + x22 = (x20 << 4); + x23 = (x18 * (uint64_t)0x2); + x24 = (x16 << 6); + x25 = (x14 << 3); + x26 = (uint8_t)(x12 & UINT8_C(0xff)); + x27 = (x12 >> 8); + x28 = (uint8_t)(x27 & UINT8_C(0xff)); + x29 = (x27 >> 8); + x30 = (uint8_t)(x29 & UINT8_C(0xff)); + x31 = (x29 >> 8); + x32 = (uint8_t)(x31 & UINT8_C(0xff)); + x33 = (x31 >> 8); + x34 = (uint8_t)(x33 & UINT8_C(0xff)); + x35 = (x33 >> 8); + x36 = 
(uint8_t)(x35 & UINT8_C(0xff)); + x37 = (uint8_t)(x35 >> 8); + x38 = (x25 + (uint64_t)x37); + x39 = (uint8_t)(x38 & UINT8_C(0xff)); + x40 = (x38 >> 8); + x41 = (uint8_t)(x40 & UINT8_C(0xff)); + x42 = (x40 >> 8); + x43 = (uint8_t)(x42 & UINT8_C(0xff)); + x44 = (x42 >> 8); + x45 = (uint8_t)(x44 & UINT8_C(0xff)); + x46 = (x44 >> 8); + x47 = (uint8_t)(x46 & UINT8_C(0xff)); + x48 = (x46 >> 8); + x49 = (uint8_t)(x48 & UINT8_C(0xff)); + x50 = (uint8_t)(x48 >> 8); + x51 = (x24 + (uint64_t)x50); + x52 = (uint8_t)(x51 & UINT8_C(0xff)); + x53 = (x51 >> 8); + x54 = (uint8_t)(x53 & UINT8_C(0xff)); + x55 = (x53 >> 8); + x56 = (uint8_t)(x55 & UINT8_C(0xff)); + x57 = (x55 >> 8); + x58 = (uint8_t)(x57 & UINT8_C(0xff)); + x59 = (x57 >> 8); + x60 = (uint8_t)(x59 & UINT8_C(0xff)); + x61 = (x59 >> 8); + x62 = (uint8_t)(x61 & UINT8_C(0xff)); + x63 = (x61 >> 8); + x64 = (uint8_t)(x63 & UINT8_C(0xff)); + x65 = (fiat_25519_uint1)(x63 >> 8); + x66 = (x23 + (uint64_t)x65); + x67 = (uint8_t)(x66 & UINT8_C(0xff)); + x68 = (x66 >> 8); + x69 = (uint8_t)(x68 & UINT8_C(0xff)); + x70 = (x68 >> 8); + x71 = (uint8_t)(x70 & UINT8_C(0xff)); + x72 = (x70 >> 8); + x73 = (uint8_t)(x72 & UINT8_C(0xff)); + x74 = (x72 >> 8); + x75 = (uint8_t)(x74 & UINT8_C(0xff)); + x76 = (x74 >> 8); + x77 = (uint8_t)(x76 & UINT8_C(0xff)); + x78 = (uint8_t)(x76 >> 8); + x79 = (x22 + (uint64_t)x78); + x80 = (uint8_t)(x79 & UINT8_C(0xff)); + x81 = (x79 >> 8); + x82 = (uint8_t)(x81 & UINT8_C(0xff)); + x83 = (x81 >> 8); + x84 = (uint8_t)(x83 & UINT8_C(0xff)); + x85 = (x83 >> 8); + x86 = (uint8_t)(x85 & UINT8_C(0xff)); + x87 = (x85 >> 8); + x88 = (uint8_t)(x87 & UINT8_C(0xff)); + x89 = (x87 >> 8); + x90 = (uint8_t)(x89 & UINT8_C(0xff)); + x91 = (uint8_t)(x89 >> 8); + out1[0] = x26; + out1[1] = x28; + out1[2] = x30; + out1[3] = x32; + out1[4] = x34; + out1[5] = x36; + out1[6] = x39; + out1[7] = x41; + out1[8] = x43; + out1[9] = x45; + out1[10] = x47; + out1[11] = x49; + out1[12] = x52; + out1[13] = x54; + out1[14] = x56; + out1[15] = x58; + out1[16] = x60; + out1[17] = x62; + out1[18] = x64; + out1[19] = x67; + out1[20] = x69; + out1[21] = x71; + out1[22] = x73; + out1[23] = x75; + out1[24] = x77; + out1[25] = x80; + out1[26] = x82; + out1[27] = x84; + out1[28] = x86; + out1[29] = x88; + out1[30] = x90; + out1[31] = x91; +} + +/* + * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. 
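(Note on fiat_25519_to_bytes above, not part of the diff: the subborrow/cmovznz/addcarry prologue conditionally subtracts p, so the output is the canonical, fully reduced 32-byte little-endian encoding. A tiny, illustrative check, assuming the header is available as "curve25519_64.h":)

/* Illustrative only: serialize the field element 2. */
#include <stdint.h>
#include <stdio.h>
#include "curve25519_64.h"

int main(void) {
  fiat_25519_tight_field_element two = {2, 0, 0, 0, 0};
  uint8_t bytes[32];
  fiat_25519_to_bytes(bytes, two);
  printf("%02x %02x ... %02x\n", bytes[0], bytes[1], bytes[31]); /* 02 00 ... 00 */
  return 0;
}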
+ * + * Postconditions: + * eval out1 mod m = bytes_eval arg1 mod m + * + * Input Bounds: + * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint8_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint8_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint8_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint8_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + uint64_t x71; + x1 = ((uint64_t)(arg1[31]) << 44); + x2 = ((uint64_t)(arg1[30]) << 36); + x3 = ((uint64_t)(arg1[29]) << 28); + x4 = ((uint64_t)(arg1[28]) << 20); + x5 = ((uint64_t)(arg1[27]) << 12); + x6 = ((uint64_t)(arg1[26]) << 4); + x7 = ((uint64_t)(arg1[25]) << 47); + x8 = ((uint64_t)(arg1[24]) << 39); + x9 = ((uint64_t)(arg1[23]) << 31); + x10 = ((uint64_t)(arg1[22]) << 23); + x11 = ((uint64_t)(arg1[21]) << 15); + x12 = ((uint64_t)(arg1[20]) << 7); + x13 = ((uint64_t)(arg1[19]) << 50); + x14 = ((uint64_t)(arg1[18]) << 42); + x15 = ((uint64_t)(arg1[17]) << 34); + x16 = ((uint64_t)(arg1[16]) << 26); + x17 = ((uint64_t)(arg1[15]) << 18); + x18 = ((uint64_t)(arg1[14]) << 10); + x19 = ((uint64_t)(arg1[13]) << 2); + x20 = ((uint64_t)(arg1[12]) << 45); + x21 = ((uint64_t)(arg1[11]) << 37); + x22 = ((uint64_t)(arg1[10]) << 29); + x23 = ((uint64_t)(arg1[9]) << 21); + x24 = ((uint64_t)(arg1[8]) << 13); + x25 = ((uint64_t)(arg1[7]) << 5); + x26 = ((uint64_t)(arg1[6]) << 48); + x27 = ((uint64_t)(arg1[5]) << 40); + x28 = ((uint64_t)(arg1[4]) << 32); + x29 = ((uint64_t)(arg1[3]) << 24); + x30 = ((uint64_t)(arg1[2]) << 16); + x31 = ((uint64_t)(arg1[1]) << 8); + x32 = (arg1[0]); + x33 = (x31 + (uint64_t)x32); + x34 = (x30 + x33); + x35 = (x29 + x34); + x36 = (x28 + x35); + x37 = (x27 + x36); + x38 = (x26 + x37); + x39 = (x38 & UINT64_C(0x7ffffffffffff)); + x40 = (uint8_t)(x38 >> 51); + x41 = (x25 + (uint64_t)x40); + x42 = (x24 + x41); + x43 = (x23 + x42); + x44 = (x22 + x43); + x45 = (x21 + x44); + x46 = (x20 + x45); + x47 = (x46 & UINT64_C(0x7ffffffffffff)); + x48 = (uint8_t)(x46 >> 51); + x49 = (x19 + (uint64_t)x48); + x50 = (x18 + x49); + x51 = (x17 + x50); + x52 = (x16 + x51); + x53 = (x15 + x52); + x54 = (x14 + 
x53); + x55 = (x13 + x54); + x56 = (x55 & UINT64_C(0x7ffffffffffff)); + x57 = (uint8_t)(x55 >> 51); + x58 = (x12 + (uint64_t)x57); + x59 = (x11 + x58); + x60 = (x10 + x59); + x61 = (x9 + x60); + x62 = (x8 + x61); + x63 = (x7 + x62); + x64 = (x63 & UINT64_C(0x7ffffffffffff)); + x65 = (uint8_t)(x63 >> 51); + x66 = (x6 + (uint64_t)x65); + x67 = (x5 + x66); + x68 = (x4 + x67); + x69 = (x3 + x68); + x70 = (x2 + x69); + x71 = (x1 + x70); + out1[0] = x39; + out1[1] = x47; + out1[2] = x56; + out1[3] = x64; + out1[4] = x71; +} + +/* + * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. + * + * Postconditions: + * eval out1 mod m = (121666 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + fiat_25519_uint128 x1; + fiat_25519_uint128 x2; + fiat_25519_uint128 x3; + fiat_25519_uint128 x4; + fiat_25519_uint128 x5; + uint64_t x6; + uint64_t x7; + fiat_25519_uint128 x8; + uint64_t x9; + uint64_t x10; + fiat_25519_uint128 x11; + uint64_t x12; + uint64_t x13; + fiat_25519_uint128 x14; + uint64_t x15; + uint64_t x16; + fiat_25519_uint128 x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + fiat_25519_uint1 x22; + uint64_t x23; + uint64_t x24; + fiat_25519_uint1 x25; + uint64_t x26; + uint64_t x27; + x1 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[4])); + x2 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[3])); + x3 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[2])); + x4 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[1])); + x5 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[0])); + x6 = (uint64_t)(x5 >> 51); + x7 = (uint64_t)(x5 & UINT64_C(0x7ffffffffffff)); + x8 = (x6 + x4); + x9 = (uint64_t)(x8 >> 51); + x10 = (uint64_t)(x8 & UINT64_C(0x7ffffffffffff)); + x11 = (x9 + x3); + x12 = (uint64_t)(x11 >> 51); + x13 = (uint64_t)(x11 & UINT64_C(0x7ffffffffffff)); + x14 = (x12 + x2); + x15 = (uint64_t)(x14 >> 51); + x16 = (uint64_t)(x14 & UINT64_C(0x7ffffffffffff)); + x17 = (x15 + x1); + x18 = (uint64_t)(x17 >> 51); + x19 = (uint64_t)(x17 & UINT64_C(0x7ffffffffffff)); + x20 = (x18 * UINT8_C(0x13)); + x21 = (x7 + x20); + x22 = (fiat_25519_uint1)(x21 >> 51); + x23 = (x21 & UINT64_C(0x7ffffffffffff)); + x24 = (x22 + x10); + x25 = (fiat_25519_uint1)(x24 >> 51); + x26 = (x24 & UINT64_C(0x7ffffffffffff)); + x27 = (x25 + x13); + out1[0] = x23; + out1[1] = x26; + out1[2] = x27; + out1[3] = x16; + out1[4] = x19; +} diff --git a/ring-0.17.14/third_party/fiat/curve25519_64_adx.h b/ring-0.17.14/third_party/fiat/curve25519_64_adx.h new file mode 100644 index 0000000000..02e8ad9114 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/curve25519_64_adx.h @@ -0,0 +1,695 @@ +#include +#include "../../crypto/internal.h" + +#include +#include +#include + +typedef uint64_t fe4[4]; +typedef uint8_t fiat_uint1; +typedef int8_t fiat_int1; + +static __inline__ uint64_t fiat_value_barrier_u64(uint64_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} + +__attribute__((target("adx,bmi2"))) +static inline void fe4_mul(fe4 out, const fe4 x, const fe4 y) { fiat_curve25519_adx_mul(out, x, y); } + +__attribute__((target("adx,bmi2"))) +static inline void fe4_sq(fe4 out, const fe4 x) { fiat_curve25519_adx_square(out, x); } + +/* + * The function fiat_mulx_u64 is a multiplication, returning the full double-width result. 
+ * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^64 + * out2 = ⌊arg1 * arg2 / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0xffffffffffffffff] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { +// NOTE: edited after generation +#if defined(_M_X64) + unsigned long long t; + *out1 = _umul128(arg1, arg2, &t); + *out2 = t; +#elif defined(_M_ARM64) + *out1 = arg1 * arg2; + *out2 = __umulh(arg1, arg2); +#else + unsigned __int128 t = (unsigned __int128)arg1 * arg2; + *out1 = t; + *out2 = (t >> 64); +#endif +} + +/* + * The function fiat_addcarryx_u64 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^64 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_addcarryx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { +// NOTE: edited after generation +#if defined(__has_builtin) +# if __has_builtin(__builtin_ia32_addcarryx_u64) +# define addcarry64 __builtin_ia32_addcarryx_u64 +# endif +#endif +#if defined(addcarry64) + long long unsigned int t; + *out2 = addcarry64(arg1, arg2, arg3, &t); + *out1 = t; +#elif defined(_M_X64) + long long unsigned int t; + *out2 = _addcarry_u64(arg1, arg2, arg3, out1); + *out1 = t; +#else + arg2 += arg1; + arg1 = arg2 < arg1; + uint64_t ret = arg2 + arg3; + arg1 += ret < arg2; + *out1 = ret; + *out2 = arg1; +#endif +#undef addcarry64 +} + +/* + * The function fiat_subborrowx_u64 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_subborrowx_u64(uint64_t* out1, fiat_uint1* out2, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(__has_builtin) +# if __has_builtin(__builtin_ia32_subborrow_u64) +# define subborrow64 __builtin_ia32_subborrow_u64 +# endif +#endif +#if defined(subborrow64) + long long unsigned int t; + *out2 = subborrow64(arg1, arg2, arg3, &t); + *out1 = t; +#elif defined(_M_X64) + long long unsigned int t; + *out2 = _subborrow_u64(arg1, arg2, arg3, &t); // NOTE: edited after generation + *out1 = t; +#else + *out1 = arg2 - arg3 - arg1; + *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); +#endif +#undef subborrow64 +} + +/* + * The function fiat_cmovznz_u64 is a single-word conditional move. 
+ * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +__attribute__((target("adx,bmi2"))) +static inline void fiat_cmovznz_u64(uint64_t* out1, fiat_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_value_barrier_u64(x2) & arg3) | (fiat_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_add(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { + uint64_t x1; + fiat_uint1 x2; + uint64_t x3; + fiat_uint1 x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + fiat_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_uint1 x11; + uint64_t x12; + fiat_uint1 x13; + uint64_t x14; + fiat_uint1 x15; + uint64_t x16; + fiat_uint1 x17; + uint64_t x18; + uint64_t x19; + fiat_uint1 x20; + fiat_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_cmovznz_u64(&x9, x8, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_addcarryx_u64(&x12, &x13, x11, x3, 0x0); + fiat_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_addcarryx_u64(&x16, &x17, x15, x7, 0x0); + fiat_cmovznz_u64(&x18, x17, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x19, &x20, 0x0, x10, x18); + out1[0] = x19; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_sub(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) { + uint64_t x1; + uint64_t x2; + fiat_uint1 x3; + uint64_t x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + uint64_t x8; + fiat_uint1 x9; + uint64_t x10; + uint64_t x11; + fiat_uint1 x12; + uint64_t x13; + uint64_t x14; + fiat_uint1 x15; + uint64_t x16; + fiat_uint1 x17; + uint64_t x18; + fiat_uint1 x19; + uint64_t x20; + fiat_uint1 x21; + uint64_t x22; + uint64_t x23; + fiat_uint1 x24; + x1 = (arg2[0]); + fiat_subborrowx_u64(&x2, &x3, 0x0, (arg1[0]), x1); + x4 = (arg2[1]); + fiat_subborrowx_u64(&x5, &x6, x3, (arg1[1]), x4); + x7 = (arg2[2]); + fiat_subborrowx_u64(&x8, &x9, x6, (arg1[2]), x7); + x10 = (arg2[3]); + fiat_subborrowx_u64(&x11, &x12, x9, (arg1[3]), x10); + fiat_cmovznz_u64(&x13, x12, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses 
sbb, and + fiat_subborrowx_u64(&x14, &x15, 0x0, x2, x13); + fiat_subborrowx_u64(&x16, &x17, x15, x5, 0x0); + fiat_subborrowx_u64(&x18, &x19, x17, x8, 0x0); + fiat_subborrowx_u64(&x20, &x21, x19, x11, 0x0); + fiat_cmovznz_u64(&x22, x21, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_subborrowx_u64(&x23, &x24, 0x0, x14, x22); + out1[0] = x23; + out1[1] = x16; + out1[2] = x18; + out1[3] = x20; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg2: [0x0 ~> 0x3ffffffffffffff] // NOTE: this is not any uint64! + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_scmul(uint64_t out1[4], const uint64_t arg1[4], uint64_t arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + fiat_uint1 x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_uint1 x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + fiat_uint1 x18; + uint64_t x19; + fiat_uint1 x20; + uint64_t x21; + fiat_uint1 x22; + uint64_t x23; + fiat_uint1 x24; + uint64_t x25; + uint64_t x26; + fiat_uint1 x27; + fiat_mulx_u64(&x1, &x2, (arg1[0]), arg2); + fiat_mulx_u64(&x3, &x4, (arg1[1]), arg2); + fiat_addcarryx_u64(&x5, &x6, 0x0, x2, x3); + fiat_mulx_u64(&x7, &x8, (arg1[2]), arg2); + fiat_addcarryx_u64(&x9, &x10, x6, x4, x7); + fiat_mulx_u64(&x11, &x12, (arg1[3]), arg2); + fiat_addcarryx_u64(&x13, &x14, x10, x8, x11); + fiat_mulx_u64(&x15, &x16, (x12 + (uint64_t)x14), UINT8_C(0x26)); + fiat_addcarryx_u64(&x17, &x18, 0x0, x1, x15); + fiat_addcarryx_u64(&x19, &x20, x18, x5, 0x0); + fiat_addcarryx_u64(&x21, &x22, x20, x9, 0x0); + fiat_addcarryx_u64(&x23, &x24, x22, x13, 0x0); + fiat_cmovznz_u64(&x25, x24, 0x0, UINT8_C(0x26)); // NOTE: clang 14 for Zen 2 uses sbb, and + fiat_addcarryx_u64(&x26, &x27, 0x0, x17, x25); + out1[0] = x26; + out1[1] = x19; + out1[2] = x21; + out1[3] = x23; +} + +/* + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_canon(uint64_t out1[4], const uint64_t arg1[4]) { + uint64_t x1; + fiat_uint1 x2; + uint64_t x3; + fiat_uint1 x4; + uint64_t x5; + fiat_uint1 x6; + uint64_t x7; + fiat_uint1 x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_uint1 x14; + uint64_t x15; + fiat_uint1 x16; + uint64_t x17; + fiat_uint1 x18; + uint64_t x19; + fiat_uint1 x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + fiat_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0xffffffffffffffed)); + fiat_subborrowx_u64(&x3, &x4, x2, (arg1[1]), UINT64_C(0xffffffffffffffff)); + fiat_subborrowx_u64(&x5, &x6, x4, (arg1[2]), UINT64_C(0xffffffffffffffff)); + fiat_subborrowx_u64(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7fffffffffffffff)); + fiat_cmovznz_u64(&x9, x8, x1, (arg1[0])); + fiat_cmovznz_u64(&x10, x8, x3, (arg1[1])); + fiat_cmovznz_u64(&x11, x8, x5, (arg1[2])); + fiat_cmovznz_u64(&x12, x8, x7, (arg1[3])); + fiat_subborrowx_u64(&x13, &x14, 0x0, x9, UINT64_C(0xffffffffffffffed)); + fiat_subborrowx_u64(&x15, &x16, x14, x10, 
UINT64_C(0xffffffffffffffff)); + fiat_subborrowx_u64(&x17, &x18, x16, x11, UINT64_C(0xffffffffffffffff)); + fiat_subborrowx_u64(&x19, &x20, x18, x12, UINT64_C(0x7fffffffffffffff)); + fiat_cmovznz_u64(&x21, x20, x13, x9); + fiat_cmovznz_u64(&x22, x20, x15, x10); + fiat_cmovznz_u64(&x23, x20, x17, x11); + fiat_cmovznz_u64(&x24, x20, x19, x12); + out1[0] = x21; + out1[1] = x22; + out1[2] = x23; + out1[3] = x24; +} + +/* + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +__attribute__((target("adx,bmi2"))) +static void fe4_cswap(uint64_t out1[4], uint64_t out2[4], fiat_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + // NOTE: clang 14 for Zen 2 uses YMM registers + fiat_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0])); + fiat_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1])); + fiat_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2])); + fiat_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3])); + fiat_cmovznz_u64(&x5, arg1, (arg3[0]), (arg2[0])); + fiat_cmovznz_u64(&x6, arg1, (arg3[1]), (arg2[1])); + fiat_cmovznz_u64(&x7, arg1, (arg3[2]), (arg2[2])); + fiat_cmovznz_u64(&x8, arg1, (arg3[3]), (arg2[3])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out2[0] = x5; + out2[1] = x6; + out2[2] = x7; + out2[3] = x8; +} + +// The following functions are adaped from crypto/curve25519/curve25519.c +// It would be desirable to share the code, but with the current field +// implementations both 4-limb and 5-limb versions of the curve-level code need +// to be included in builds targetting an unknown variant of x86_64. 
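For review context, a hedged, standalone sketch (not part of this patch) of the two patterns the fe4 helpers above rely on: the branchless masked select behind fiat_cmovznz_u64 and fe4_cswap, and the reduction constant 0x26 = 38 = 2*19 that fe4_add, fe4_sub and fe4_scmul fold back into the low limb whenever a carry escapes the top word, since 2^255 ≡ 19 and hence 2^256 ≡ 38 (mod 2^255 - 19). The names select_u64 and cswap4 are illustrative only, and the sketch deliberately omits the inline-asm value barrier the real code uses to stop the compiler from turning the select back into a branch.

    /* Illustrative only; compiles and runs on its own, independent of this header. */
    #include <stdint.h>
    #include <stdio.h>

    /* Branchless single-word select: returns if_zero when cond == 0, else if_nonzero.
     * Same shape as fiat_cmovznz_u64, minus the value barrier. */
    static uint64_t select_u64(uint64_t cond, uint64_t if_zero, uint64_t if_nonzero) {
      uint64_t mask = (uint64_t)0 - (uint64_t)(cond != 0);  /* 0 or all-ones */
      return (mask & if_nonzero) | (~mask & if_zero);
    }

    /* Branchless conditional swap of two 4-limb values, mirroring fe4_cswap:
     * when swap == 0 the arrays are unchanged, otherwise their contents trade places,
     * with no data-dependent branches. */
    static void cswap4(uint64_t a[4], uint64_t b[4], uint64_t swap) {
      for (int i = 0; i < 4; i++) {
        uint64_t x = select_u64(swap, a[i], b[i]);
        uint64_t y = select_u64(swap, b[i], a[i]);
        a[i] = x;
        b[i] = y;
      }
    }

    int main(void) {
      uint64_t a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
      cswap4(a, b, 1);
      printf("%llu %llu\n", (unsigned long long)select_u64(0, 11, 22),
             (unsigned long long)a[0]);  /* prints: 11 5 */
      return 0;
    }

In the patch itself the same select is what makes the Montgomery-ladder cswap calls in x25519_scalar_mult_adx constant time, and the carry-times-38 folding is why the fe4 functions can keep results in four saturated 64-bit limbs without a full modular reduction on every operation.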
+ +__attribute__((target("adx,bmi2"))) +static void fe4_invert(fe4 out, const fe4 z) { + fe4 t0; + fe4 t1; + fe4 t2; + fe4 t3; + int i; + + fe4_sq(t0, z); + fe4_sq(t1, t0); + for (i = 1; i < 2; ++i) { + fe4_sq(t1, t1); + } + fe4_mul(t1, z, t1); + fe4_mul(t0, t0, t1); + fe4_sq(t2, t0); + fe4_mul(t1, t1, t2); + fe4_sq(t2, t1); + for (i = 1; i < 5; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t2, t1); + for (i = 1; i < 10; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t2, t2, t1); + fe4_sq(t3, t2); + for (i = 1; i < 20; ++i) { + fe4_sq(t3, t3); + } + fe4_mul(t2, t3, t2); + fe4_sq(t2, t2); + for (i = 1; i < 10; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t2, t1); + for (i = 1; i < 50; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t2, t2, t1); + fe4_sq(t3, t2); + for (i = 1; i < 100; ++i) { + fe4_sq(t3, t3); + } + fe4_mul(t2, t3, t2); + fe4_sq(t2, t2); + for (i = 1; i < 50; ++i) { + fe4_sq(t2, t2); + } + fe4_mul(t1, t2, t1); + fe4_sq(t1, t1); + for (i = 1; i < 5; ++i) { + fe4_sq(t1, t1); + } + fe4_mul(out, t1, t0); +} + +RING_NOINLINE // https://github.com/rust-lang/rust/issues/116573 +__attribute__((target("adx,bmi2"))) +void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) { + uint8_t e[32]; + OPENSSL_memcpy(e, scalar, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. 
+ // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe4 x1, x2 = {1}, z2 = {0}, x3, z3 = {1}, tmp0, tmp1; + OPENSSL_memcpy(x1, point, sizeof(fe4)); + x1[3] &= (uint64_t)(-1)>>1; + OPENSSL_memcpy(x3, x1, sizeof(fe4)); + + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe4_cswap(x2, x3, swap, x2, x3); + fe4_cswap(z2, z3, swap, z2, z3); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // x1 = 0 + fe4_sub(tmp0, x3, z3); + fe4_sub(tmp1, x2, z2); + fe4_add(x2, x2, z2); + fe4_add(z2, x3, z3); + fe4_mul(z3, tmp0, x2); + fe4_mul(z2, z2, tmp1); + fe4_sq(tmp0, tmp1); + fe4_sq(tmp1, x2); + fe4_add(x3, z3, z2); + fe4_sub(z2, z3, z2); + fe4_mul(x2, tmp1, tmp0); + fe4_sub(tmp1, tmp1, tmp0); + fe4_sq(z2, z2); + fe4_scmul(z3, tmp1, 121666); + fe4_sq(x3, x3); + fe4_add(tmp0, tmp0, z3); + fe4_mul(z3, x1, z2); + fe4_mul(z2, tmp1, tmp0); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) + fe4_cswap(x2, x3, swap, x2, x3); + fe4_cswap(z2, z3, swap, z2, z3); + + fe4_invert(z2, z2); + fe4_mul(x2, x2, z2); + fe4_canon(x2, x2); + OPENSSL_memcpy(out, x2, sizeof(fe4)); +} + +typedef struct { + fe4 X; + fe4 Y; + fe4 Z; + fe4 T; +} ge_p3_4; + +typedef struct { + fe4 yplusx; + fe4 yminusx; + fe4 xy2d; +} ge_precomp_4; + +__attribute__((target("adx,bmi2"))) +static void inline_x25519_ge_dbl_4(ge_p3_4 *r, const ge_p3_4 *p, bool skip_t) { + // Transcribed from a Coq function proven against affine coordinates. + // https://github.com/mit-plv/fiat-crypto/blob/9943ba9e7d8f3e1c0054b2c94a5edca46ea73ef8/src/Curves/Edwards/XYZT/Basic.v#L136-L165 + fe4 trX, trZ, trT, t0, cX, cY, cZ, cT; + fe4_sq(trX, p->X); + fe4_sq(trZ, p->Y); + fe4_sq(trT, p->Z); + fe4_add(trT, trT, trT); + fe4_add(cY, p->X, p->Y); + fe4_sq(t0, cY); + fe4_add(cY, trZ, trX); + fe4_sub(cZ, trZ, trX); + fe4_sub(cX, t0, cY); + fe4_sub(cT, trT, cZ); + fe4_mul(r->X, cX, cT); + fe4_mul(r->Y, cY, cZ); + fe4_mul(r->Z, cZ, cT); + if (!skip_t) { + fe4_mul(r->T, cX, cY); + } +} + +__attribute__((target("adx,bmi2"))) +__attribute__((always_inline)) // 4% speedup with clang14 and zen2 +static inline void +ge_p3_add_p3_precomp_4(ge_p3_4 *r, const ge_p3_4 *p, const ge_precomp_4 *q) { + fe4 A, B, C, YplusX, YminusX, D, X3, Y3, Z3, T3; + // Transcribed from a Coq function proven against affine coordinates. 
+ // https://github.com/mit-plv/fiat-crypto/blob/a36568d1d73aff5d7accc79fd28be672882f9c17/src/Curves/Edwards/XYZT/Precomputed.v#L38-L56 + fe4_add(YplusX, p->Y, p->X); + fe4_sub(YminusX, p->Y, p->X); + fe4_mul(A, YplusX, q->yplusx); + fe4_mul(B, YminusX, q->yminusx); + fe4_mul(C, q->xy2d, p->T); + fe4_add(D, p->Z, p->Z); + fe4_sub(X3, A, B); + fe4_add(Y3, A, B); + fe4_add(Z3, D, C); + fe4_sub(T3, D, C); + fe4_mul(r->X, X3, T3); + fe4_mul(r->Y, Y3, Z3); + fe4_mul(r->Z, Z3, T3); + fe4_mul(r->T, X3, Y3); +} + +__attribute__((always_inline)) // 25% speedup with clang14 and zen2 +static inline void table_select_4(ge_precomp_4 *t, const int pos, + const signed char b) { + uint8_t bnegative = constant_time_msb_w(b); + uint8_t babs = b - ((bnegative & b) << 1); + + uint8_t t_bytes[3][32] = { + {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}}; +#if defined(__clang__) + __asm__("" : "+m" (t_bytes) : /*no inputs*/); +#endif + OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), ""); + for (int i = 0; i < 8; i++) { + constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i], + sizeof(t_bytes), + constant_time_eq_w(babs, 1 + i)); + } + + OPENSSL_STATIC_ASSERT(sizeof(t_bytes) == sizeof(ge_precomp_4), ""); + + // fe4 uses saturated 64-bit limbs, so converting from bytes is just a copy. + OPENSSL_memcpy(t, t_bytes, sizeof(ge_precomp_4)); + + fe4 xy2d_neg = {0}; + fe4_sub(xy2d_neg, xy2d_neg, t->xy2d); + constant_time_conditional_memcpy(t->yplusx, t_bytes[1], sizeof(fe4), + bnegative); + constant_time_conditional_memcpy(t->yminusx, t_bytes[0], sizeof(fe4), + bnegative); + constant_time_conditional_memcpy(t->xy2d, xy2d_neg, sizeof(fe4), bnegative); +} + +// h = a * B +// where a = a[0]+256*a[1]+...+256^31 a[31] +// B is the Ed25519 base point (x,4/5) with x positive. +// +// Preconditions: +// a[31] <= 127 +RING_NOINLINE // https://github.com/rust-lang/rust/issues/116573 +__attribute__((target("adx,bmi2"))) +void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]) { + signed char e[64]; + signed char carry; + + for (unsigned i = 0; i < 32; ++i) { + e[2 * i + 0] = (a[i] >> 0) & 15; + e[2 * i + 1] = (a[i] >> 4) & 15; + } + // each e[i] is between 0 and 15 + // e[63] is between 0 and 7 + + carry = 0; + for (unsigned i = 0; i < 63; ++i) { + e[i] += carry; + carry = e[i] + 8; + carry >>= 4; + e[i] -= carry << 4; + } + e[63] += carry; + // each e[i] is between -8 and 8 + + ge_p3_4 r = {{0}, {1}, {1}, {0}}; + for (unsigned i = 1; i < 64; i += 2) { + ge_precomp_4 t; + table_select_4(&t, i / 2, e[i]); + ge_p3_add_p3_precomp_4(&r, &r, &t); + } + + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/true); + inline_x25519_ge_dbl_4(&r, &r, /*skip_t=*/false); + + for (unsigned i = 0; i < 64; i += 2) { + ge_precomp_4 t; + table_select_4(&t, i / 2, e[i]); + ge_p3_add_p3_precomp_4(&r, &r, &t); + } + + // fe4 uses saturated 64-bit limbs, so converting to bytes is just a copy. 
+ // Satisfy stated precondition of fiat_25519_from_bytes; tests pass either way + fe4_canon(r.X, r.X); + fe4_canon(r.Y, r.Y); + fe4_canon(r.Z, r.Z); + fe4_canon(r.T, r.T); + OPENSSL_STATIC_ASSERT(sizeof(ge_p3_4) == sizeof(uint8_t[4][32]), ""); + OPENSSL_memcpy(h, &r, sizeof(ge_p3_4)); +} diff --git a/ring-0.17.14/third_party/fiat/curve25519_64_msvc.h b/ring-0.17.14/third_party/fiat/curve25519_64_msvc.h new file mode 100644 index 0000000000..4b916e42e1 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/curve25519_64_msvc.h @@ -0,0 +1,1225 @@ +/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier --no-wide-int 25519 64 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */ +/* curve description: 25519 */ +/* machine_wordsize = 64 (from "64") */ +/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */ +/* n = 5 (from "(auto)") */ +/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */ +/* tight_bounds_multiplier = 1 (from "") */ +/* */ +/* Computed values: */ +/* carry_chain = [0, 1, 2, 3, 4, 0, 1] */ +/* eval z = z[0] + (z[1] << 51) + (z[2] << 102) + (z[3] << 153) + (z[4] << 204) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* balance = [0xfffffffffffda, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe] */ + +#include +#include +#if defined(_M_X64) +#include +#endif + +typedef unsigned char fiat_25519_uint1; +typedef signed char fiat_25519_int1; + +#define FIAT_25519_FIAT_INLINE inline + +/* The type fiat_25519_loose_field_element is a field element with loose bounds. */ +/* Bounds: [[0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000]] */ +typedef uint64_t fiat_25519_loose_field_element[5]; + +/* The type fiat_25519_tight_field_element is a field element with tight bounds. */ +/* Bounds: [[0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000]] */ +typedef uint64_t fiat_25519_tight_field_element[5]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#define fiat_25519_value_barrier_u64(x) (x) + +/* + * The function fiat_25519_addcarryx_u64 is an addition with carry. 
+ * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^64 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u64(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { +// NOTE: edited after generation +#if defined(_M_X64) + *out2 = _addcarry_u64(arg1, arg2, arg3, out1); +#else + arg2 += arg1; + arg1 = arg2 < arg1; + arg3 += arg2; + arg1 += arg3 < arg2; + *out1 = arg3; + *out2 = arg1; +#endif +} + +/* + * The function fiat_25519_subborrowx_u64 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u64(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(_M_X64) + *out2 = _subborrow_u64(arg1, arg2, arg3, out1); // NOTE: edited after generation +#else + *out1 = arg2 - arg3 - arg1; + *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); +#endif +} + +/* + * The function fiat_25519_addcarryx_u51 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^51 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^51⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x7ffffffffffff] + * arg3: [0x0 ~> 0x7ffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0x7ffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + uint64_t x1; + uint64_t x2; + fiat_25519_uint1 x3; + x1 = ((arg1 + arg2) + arg3); + x2 = (x1 & UINT64_C(0x7ffffffffffff)); + x3 = (fiat_25519_uint1)(x1 >> 51); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_25519_subborrowx_u51 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^51 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^51⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0x7ffffffffffff] + * arg3: [0x0 ~> 0x7ffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0x7ffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + int64_t x1; + fiat_25519_int1 x2; + uint64_t x3; + x1 = ((int64_t)(arg2 - (int64_t)arg1) - (int64_t)arg3); + x2 = (fiat_25519_int1)(x1 >> 51); + x3 = (x1 & UINT64_C(0x7ffffffffffff)); + *out1 = x3; + *out2 = (fiat_25519_uint1)(0x0 - x2); +} + +/* + * The function fiat_25519_mulx_u64 is a multiplication, returning the full double-width result. 
+ * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^64 + * out2 = ⌊arg1 * arg2 / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { +// NOTE: edited after generation +#if defined(_M_X64) + *out1 = _umul128(arg1, arg2, out2); +#elif defined(_M_ARM64) + *out1 = arg1 * arg2; + *out2 = __umulh(arg1, arg2); +#else +#error "This file is intended for MSVC on X64 or ARM64" +#endif +} + +/* + * The function fiat_25519_cmovznz_u64 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_25519_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_25519_value_barrier_u64(x2) & arg3) | (fiat_25519_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_25519_carry_mul multiplies two field elements and reduces the result. + * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + fiat_25519_uint1 x52; + uint64_t x53; + fiat_25519_uint1 x54; + uint64_t x55; + fiat_25519_uint1 x56; + uint64_t x57; + fiat_25519_uint1 x58; + uint64_t x59; + fiat_25519_uint1 x60; + uint64_t x61; + fiat_25519_uint1 x62; + uint64_t x63; + fiat_25519_uint1 x64; + uint64_t x65; + fiat_25519_uint1 x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + fiat_25519_uint1 x70; + uint64_t x71; + fiat_25519_uint1 x72; + uint64_t x73; + fiat_25519_uint1 x74; + uint64_t x75; + fiat_25519_uint1 x76; + uint64_t x77; + fiat_25519_uint1 x78; + uint64_t x79; + fiat_25519_uint1 x80; + uint64_t x81; + fiat_25519_uint1 x82; + uint64_t x83; + fiat_25519_uint1 x84; + uint64_t x85; + fiat_25519_uint1 x86; + uint64_t x87; + fiat_25519_uint1 x88; + uint64_t x89; + fiat_25519_uint1 x90; + uint64_t x91; + fiat_25519_uint1 x92; + uint64_t x93; + fiat_25519_uint1 x94; + uint64_t x95; + fiat_25519_uint1 x96; + uint64_t x97; + fiat_25519_uint1 x98; + uint64_t x99; + fiat_25519_uint1 x100; + uint64_t x101; + 
fiat_25519_uint1 x102; + uint64_t x103; + fiat_25519_uint1 x104; + uint64_t x105; + fiat_25519_uint1 x106; + uint64_t x107; + fiat_25519_uint1 x108; + uint64_t x109; + fiat_25519_uint1 x110; + uint64_t x111; + fiat_25519_uint1 x112; + uint64_t x113; + fiat_25519_uint1 x114; + uint64_t x115; + fiat_25519_uint1 x116; + uint64_t x117; + fiat_25519_uint1 x118; + uint64_t x119; + fiat_25519_uint1 x120; + uint64_t x121; + fiat_25519_uint1 x122; + uint64_t x123; + fiat_25519_uint1 x124; + uint64_t x125; + fiat_25519_uint1 x126; + uint64_t x127; + fiat_25519_uint1 x128; + uint64_t x129; + fiat_25519_uint1 x130; + uint64_t x131; + fiat_25519_uint1 x132; + uint64_t x133; + fiat_25519_uint1 x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + uint64_t x138; + fiat_25519_uint1 x139; + uint64_t x140; + uint64_t x141; + uint64_t x142; + uint64_t x143; + fiat_25519_uint1 x144; + uint64_t x145; + uint64_t x146; + uint64_t x147; + uint64_t x148; + fiat_25519_uint1 x149; + uint64_t x150; + uint64_t x151; + uint64_t x152; + uint64_t x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + fiat_25519_uint1 x158; + uint64_t x159; + uint64_t x160; + fiat_25519_mulx_u64(&x1, &x2, (arg1[4]), ((arg2[4]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x3, &x4, (arg1[4]), ((arg2[3]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x5, &x6, (arg1[4]), ((arg2[2]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x7, &x8, (arg1[4]), ((arg2[1]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x9, &x10, (arg1[3]), ((arg2[4]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x11, &x12, (arg1[3]), ((arg2[3]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x13, &x14, (arg1[3]), ((arg2[2]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x15, &x16, (arg1[2]), ((arg2[4]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x17, &x18, (arg1[2]), ((arg2[3]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x19, &x20, (arg1[1]), ((arg2[4]) * UINT8_C(0x13))); + fiat_25519_mulx_u64(&x21, &x22, (arg1[4]), (arg2[0])); + fiat_25519_mulx_u64(&x23, &x24, (arg1[3]), (arg2[1])); + fiat_25519_mulx_u64(&x25, &x26, (arg1[3]), (arg2[0])); + fiat_25519_mulx_u64(&x27, &x28, (arg1[2]), (arg2[2])); + fiat_25519_mulx_u64(&x29, &x30, (arg1[2]), (arg2[1])); + fiat_25519_mulx_u64(&x31, &x32, (arg1[2]), (arg2[0])); + fiat_25519_mulx_u64(&x33, &x34, (arg1[1]), (arg2[3])); + fiat_25519_mulx_u64(&x35, &x36, (arg1[1]), (arg2[2])); + fiat_25519_mulx_u64(&x37, &x38, (arg1[1]), (arg2[1])); + fiat_25519_mulx_u64(&x39, &x40, (arg1[1]), (arg2[0])); + fiat_25519_mulx_u64(&x41, &x42, (arg1[0]), (arg2[4])); + fiat_25519_mulx_u64(&x43, &x44, (arg1[0]), (arg2[3])); + fiat_25519_mulx_u64(&x45, &x46, (arg1[0]), (arg2[2])); + fiat_25519_mulx_u64(&x47, &x48, (arg1[0]), (arg2[1])); + fiat_25519_mulx_u64(&x49, &x50, (arg1[0]), (arg2[0])); + fiat_25519_addcarryx_u64(&x51, &x52, 0x0, x13, x7); + fiat_25519_addcarryx_u64(&x53, &x54, x52, x14, x8); + fiat_25519_addcarryx_u64(&x55, &x56, 0x0, x17, x51); + fiat_25519_addcarryx_u64(&x57, &x58, x56, x18, x53); + fiat_25519_addcarryx_u64(&x59, &x60, 0x0, x19, x55); + fiat_25519_addcarryx_u64(&x61, &x62, x60, x20, x57); + fiat_25519_addcarryx_u64(&x63, &x64, 0x0, x49, x59); + fiat_25519_addcarryx_u64(&x65, &x66, x64, x50, x61); + x67 = ((x63 >> 51) | ((x65 << 13) & UINT64_C(0xffffffffffffffff))); + x68 = (x63 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x69, &x70, 0x0, x23, x21); + fiat_25519_addcarryx_u64(&x71, &x72, x70, x24, x22); + fiat_25519_addcarryx_u64(&x73, &x74, 0x0, x27, x69); + fiat_25519_addcarryx_u64(&x75, &x76, x74, x28, x71); + 
fiat_25519_addcarryx_u64(&x77, &x78, 0x0, x33, x73); + fiat_25519_addcarryx_u64(&x79, &x80, x78, x34, x75); + fiat_25519_addcarryx_u64(&x81, &x82, 0x0, x41, x77); + fiat_25519_addcarryx_u64(&x83, &x84, x82, x42, x79); + fiat_25519_addcarryx_u64(&x85, &x86, 0x0, x25, x1); + fiat_25519_addcarryx_u64(&x87, &x88, x86, x26, x2); + fiat_25519_addcarryx_u64(&x89, &x90, 0x0, x29, x85); + fiat_25519_addcarryx_u64(&x91, &x92, x90, x30, x87); + fiat_25519_addcarryx_u64(&x93, &x94, 0x0, x35, x89); + fiat_25519_addcarryx_u64(&x95, &x96, x94, x36, x91); + fiat_25519_addcarryx_u64(&x97, &x98, 0x0, x43, x93); + fiat_25519_addcarryx_u64(&x99, &x100, x98, x44, x95); + fiat_25519_addcarryx_u64(&x101, &x102, 0x0, x9, x3); + fiat_25519_addcarryx_u64(&x103, &x104, x102, x10, x4); + fiat_25519_addcarryx_u64(&x105, &x106, 0x0, x31, x101); + fiat_25519_addcarryx_u64(&x107, &x108, x106, x32, x103); + fiat_25519_addcarryx_u64(&x109, &x110, 0x0, x37, x105); + fiat_25519_addcarryx_u64(&x111, &x112, x110, x38, x107); + fiat_25519_addcarryx_u64(&x113, &x114, 0x0, x45, x109); + fiat_25519_addcarryx_u64(&x115, &x116, x114, x46, x111); + fiat_25519_addcarryx_u64(&x117, &x118, 0x0, x11, x5); + fiat_25519_addcarryx_u64(&x119, &x120, x118, x12, x6); + fiat_25519_addcarryx_u64(&x121, &x122, 0x0, x15, x117); + fiat_25519_addcarryx_u64(&x123, &x124, x122, x16, x119); + fiat_25519_addcarryx_u64(&x125, &x126, 0x0, x39, x121); + fiat_25519_addcarryx_u64(&x127, &x128, x126, x40, x123); + fiat_25519_addcarryx_u64(&x129, &x130, 0x0, x47, x125); + fiat_25519_addcarryx_u64(&x131, &x132, x130, x48, x127); + fiat_25519_addcarryx_u64(&x133, &x134, 0x0, x67, x129); + x135 = (x134 + x131); + x136 = ((x133 >> 51) | ((x135 << 13) & UINT64_C(0xffffffffffffffff))); + x137 = (x133 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x138, &x139, 0x0, x136, x113); + x140 = (x139 + x115); + x141 = ((x138 >> 51) | ((x140 << 13) & UINT64_C(0xffffffffffffffff))); + x142 = (x138 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x143, &x144, 0x0, x141, x97); + x145 = (x144 + x99); + x146 = ((x143 >> 51) | ((x145 << 13) & UINT64_C(0xffffffffffffffff))); + x147 = (x143 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x148, &x149, 0x0, x146, x81); + x150 = (x149 + x83); + x151 = ((x148 >> 51) | ((x150 << 13) & UINT64_C(0xffffffffffffffff))); + x152 = (x148 & UINT64_C(0x7ffffffffffff)); + x153 = (x151 * UINT8_C(0x13)); + x154 = (x68 + x153); + x155 = (x154 >> 51); + x156 = (x154 & UINT64_C(0x7ffffffffffff)); + x157 = (x155 + x137); + x158 = (fiat_25519_uint1)(x157 >> 51); + x159 = (x157 & UINT64_C(0x7ffffffffffff)); + x160 = (x158 + x142); + out1[0] = x156; + out1[1] = x159; + out1[2] = x160; + out1[3] = x147; + out1[4] = x152; +} + +/* + * The function fiat_25519_carry_square squares a field element and reduces the result. 
+ * + * Postconditions: + * eval out1 mod m = (eval arg1 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + fiat_25519_uint1 x40; + uint64_t x41; + fiat_25519_uint1 x42; + uint64_t x43; + fiat_25519_uint1 x44; + uint64_t x45; + fiat_25519_uint1 x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + fiat_25519_uint1 x50; + uint64_t x51; + fiat_25519_uint1 x52; + uint64_t x53; + fiat_25519_uint1 x54; + uint64_t x55; + fiat_25519_uint1 x56; + uint64_t x57; + fiat_25519_uint1 x58; + uint64_t x59; + fiat_25519_uint1 x60; + uint64_t x61; + fiat_25519_uint1 x62; + uint64_t x63; + fiat_25519_uint1 x64; + uint64_t x65; + fiat_25519_uint1 x66; + uint64_t x67; + fiat_25519_uint1 x68; + uint64_t x69; + fiat_25519_uint1 x70; + uint64_t x71; + fiat_25519_uint1 x72; + uint64_t x73; + fiat_25519_uint1 x74; + uint64_t x75; + fiat_25519_uint1 x76; + uint64_t x77; + fiat_25519_uint1 x78; + uint64_t x79; + fiat_25519_uint1 x80; + uint64_t x81; + fiat_25519_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + fiat_25519_uint1 x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + fiat_25519_uint1 x92; + uint64_t x93; + uint64_t x94; + uint64_t x95; + uint64_t x96; + fiat_25519_uint1 x97; + uint64_t x98; + uint64_t x99; + uint64_t x100; + uint64_t x101; + uint64_t x102; + uint64_t x103; + uint64_t x104; + uint64_t x105; + fiat_25519_uint1 x106; + uint64_t x107; + uint64_t x108; + x1 = ((arg1[4]) * UINT8_C(0x13)); + x2 = (x1 * 0x2); + x3 = ((arg1[4]) * 0x2); + x4 = ((arg1[3]) * UINT8_C(0x13)); + x5 = (x4 * 0x2); + x6 = ((arg1[3]) * 0x2); + x7 = ((arg1[2]) * 0x2); + x8 = ((arg1[1]) * 0x2); + fiat_25519_mulx_u64(&x9, &x10, (arg1[4]), x1); + fiat_25519_mulx_u64(&x11, &x12, (arg1[3]), x2); + fiat_25519_mulx_u64(&x13, &x14, (arg1[3]), x4); + fiat_25519_mulx_u64(&x15, &x16, (arg1[2]), x2); + fiat_25519_mulx_u64(&x17, &x18, (arg1[2]), x5); + fiat_25519_mulx_u64(&x19, &x20, (arg1[2]), (arg1[2])); + fiat_25519_mulx_u64(&x21, &x22, (arg1[1]), x2); + fiat_25519_mulx_u64(&x23, &x24, (arg1[1]), x6); + fiat_25519_mulx_u64(&x25, &x26, (arg1[1]), x7); + fiat_25519_mulx_u64(&x27, &x28, (arg1[1]), (arg1[1])); + fiat_25519_mulx_u64(&x29, &x30, (arg1[0]), x3); + fiat_25519_mulx_u64(&x31, &x32, (arg1[0]), x6); + fiat_25519_mulx_u64(&x33, &x34, (arg1[0]), x7); + fiat_25519_mulx_u64(&x35, &x36, (arg1[0]), x8); + fiat_25519_mulx_u64(&x37, &x38, (arg1[0]), (arg1[0])); + fiat_25519_addcarryx_u64(&x39, &x40, 0x0, x21, x17); + fiat_25519_addcarryx_u64(&x41, &x42, x40, x22, x18); + fiat_25519_addcarryx_u64(&x43, &x44, 0x0, x37, x39); + fiat_25519_addcarryx_u64(&x45, &x46, x44, x38, x41); + x47 = ((x43 >> 51) | ((x45 << 13) & UINT64_C(0xffffffffffffffff))); + x48 = (x43 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x49, &x50, 0x0, x23, x19); + 
fiat_25519_addcarryx_u64(&x51, &x52, x50, x24, x20); + fiat_25519_addcarryx_u64(&x53, &x54, 0x0, x29, x49); + fiat_25519_addcarryx_u64(&x55, &x56, x54, x30, x51); + fiat_25519_addcarryx_u64(&x57, &x58, 0x0, x25, x9); + fiat_25519_addcarryx_u64(&x59, &x60, x58, x26, x10); + fiat_25519_addcarryx_u64(&x61, &x62, 0x0, x31, x57); + fiat_25519_addcarryx_u64(&x63, &x64, x62, x32, x59); + fiat_25519_addcarryx_u64(&x65, &x66, 0x0, x27, x11); + fiat_25519_addcarryx_u64(&x67, &x68, x66, x28, x12); + fiat_25519_addcarryx_u64(&x69, &x70, 0x0, x33, x65); + fiat_25519_addcarryx_u64(&x71, &x72, x70, x34, x67); + fiat_25519_addcarryx_u64(&x73, &x74, 0x0, x15, x13); + fiat_25519_addcarryx_u64(&x75, &x76, x74, x16, x14); + fiat_25519_addcarryx_u64(&x77, &x78, 0x0, x35, x73); + fiat_25519_addcarryx_u64(&x79, &x80, x78, x36, x75); + fiat_25519_addcarryx_u64(&x81, &x82, 0x0, x47, x77); + x83 = (x82 + x79); + x84 = ((x81 >> 51) | ((x83 << 13) & UINT64_C(0xffffffffffffffff))); + x85 = (x81 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x86, &x87, 0x0, x84, x69); + x88 = (x87 + x71); + x89 = ((x86 >> 51) | ((x88 << 13) & UINT64_C(0xffffffffffffffff))); + x90 = (x86 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x91, &x92, 0x0, x89, x61); + x93 = (x92 + x63); + x94 = ((x91 >> 51) | ((x93 << 13) & UINT64_C(0xffffffffffffffff))); + x95 = (x91 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x96, &x97, 0x0, x94, x53); + x98 = (x97 + x55); + x99 = ((x96 >> 51) | ((x98 << 13) & UINT64_C(0xffffffffffffffff))); + x100 = (x96 & UINT64_C(0x7ffffffffffff)); + x101 = (x99 * UINT8_C(0x13)); + x102 = (x48 + x101); + x103 = (x102 >> 51); + x104 = (x102 & UINT64_C(0x7ffffffffffff)); + x105 = (x103 + x85); + x106 = (fiat_25519_uint1)(x105 >> 51); + x107 = (x105 & UINT64_C(0x7ffffffffffff)); + x108 = (x106 + x90); + out1[0] = x104; + out1[1] = x107; + out1[2] = x108; + out1[3] = x95; + out1[4] = x100; +} + +/* + * The function fiat_25519_carry reduces a field element. + * + * Postconditions: + * eval out1 mod m = eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + x1 = (arg1[0]); + x2 = ((x1 >> 51) + (arg1[1])); + x3 = ((x2 >> 51) + (arg1[2])); + x4 = ((x3 >> 51) + (arg1[3])); + x5 = ((x4 >> 51) + (arg1[4])); + x6 = ((x1 & UINT64_C(0x7ffffffffffff)) + ((x5 >> 51) * UINT8_C(0x13))); + x7 = ((fiat_25519_uint1)(x6 >> 51) + (x2 & UINT64_C(0x7ffffffffffff))); + x8 = (x6 & UINT64_C(0x7ffffffffffff)); + x9 = (x7 & UINT64_C(0x7ffffffffffff)); + x10 = ((fiat_25519_uint1)(x7 >> 51) + (x3 & UINT64_C(0x7ffffffffffff))); + x11 = (x4 & UINT64_C(0x7ffffffffffff)); + x12 = (x5 & UINT64_C(0x7ffffffffffff)); + out1[0] = x8; + out1[1] = x9; + out1[2] = x10; + out1[3] = x11; + out1[4] = x12; +} + +/* + * The function fiat_25519_add adds two field elements. 
+ * + * Postconditions: + * eval out1 mod m = (eval arg1 + eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = ((arg1[0]) + (arg2[0])); + x2 = ((arg1[1]) + (arg2[1])); + x3 = ((arg1[2]) + (arg2[2])); + x4 = ((arg1[3]) + (arg2[3])); + x5 = ((arg1[4]) + (arg2[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_sub subtracts two field elements. + * + * Postconditions: + * eval out1 mod m = (eval arg1 - eval arg2) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = ((UINT64_C(0xfffffffffffda) + (arg1[0])) - (arg2[0])); + x2 = ((UINT64_C(0xffffffffffffe) + (arg1[1])) - (arg2[1])); + x3 = ((UINT64_C(0xffffffffffffe) + (arg1[2])) - (arg2[2])); + x4 = ((UINT64_C(0xffffffffffffe) + (arg1[3])) - (arg2[3])); + x5 = ((UINT64_C(0xffffffffffffe) + (arg1[4])) - (arg2[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_opp negates a field element. + * + * Postconditions: + * eval out1 mod m = -eval arg1 mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + x1 = (UINT64_C(0xfffffffffffda) - (arg1[0])); + x2 = (UINT64_C(0xffffffffffffe) - (arg1[1])); + x3 = (UINT64_C(0xffffffffffffe) - (arg1[2])); + x4 = (UINT64_C(0xffffffffffffe) - (arg1[3])); + x5 = (UINT64_C(0xffffffffffffe) - (arg1[4])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; +} + +/* + * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order. 
+ * + * Postconditions: + * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] + * + * Output Bounds: + * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) { + uint64_t x1; + fiat_25519_uint1 x2; + uint64_t x3; + fiat_25519_uint1 x4; + uint64_t x5; + fiat_25519_uint1 x6; + uint64_t x7; + fiat_25519_uint1 x8; + uint64_t x9; + fiat_25519_uint1 x10; + uint64_t x11; + uint64_t x12; + fiat_25519_uint1 x13; + uint64_t x14; + fiat_25519_uint1 x15; + uint64_t x16; + fiat_25519_uint1 x17; + uint64_t x18; + fiat_25519_uint1 x19; + uint64_t x20; + fiat_25519_uint1 x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint8_t x26; + uint64_t x27; + uint8_t x28; + uint64_t x29; + uint8_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint8_t x34; + uint64_t x35; + uint8_t x36; + uint8_t x37; + uint64_t x38; + uint8_t x39; + uint64_t x40; + uint8_t x41; + uint64_t x42; + uint8_t x43; + uint64_t x44; + uint8_t x45; + uint64_t x46; + uint8_t x47; + uint64_t x48; + uint8_t x49; + uint8_t x50; + uint64_t x51; + uint8_t x52; + uint64_t x53; + uint8_t x54; + uint64_t x55; + uint8_t x56; + uint64_t x57; + uint8_t x58; + uint64_t x59; + uint8_t x60; + uint64_t x61; + uint8_t x62; + uint64_t x63; + uint8_t x64; + fiat_25519_uint1 x65; + uint64_t x66; + uint8_t x67; + uint64_t x68; + uint8_t x69; + uint64_t x70; + uint8_t x71; + uint64_t x72; + uint8_t x73; + uint64_t x74; + uint8_t x75; + uint64_t x76; + uint8_t x77; + uint8_t x78; + uint64_t x79; + uint8_t x80; + uint64_t x81; + uint8_t x82; + uint64_t x83; + uint8_t x84; + uint64_t x85; + uint8_t x86; + uint64_t x87; + uint8_t x88; + uint64_t x89; + uint8_t x90; + uint8_t x91; + fiat_25519_subborrowx_u51(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0x7ffffffffffed)); + fiat_25519_subborrowx_u51(&x3, &x4, x2, (arg1[1]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x5, &x6, x4, (arg1[2]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7ffffffffffff)); + fiat_25519_subborrowx_u51(&x9, &x10, x8, (arg1[4]), UINT64_C(0x7ffffffffffff)); + fiat_25519_cmovznz_u64(&x11, x10, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_25519_addcarryx_u51(&x12, &x13, 0x0, x1, (x11 & UINT64_C(0x7ffffffffffed))); + fiat_25519_addcarryx_u51(&x14, &x15, x13, x3, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x16, &x17, x15, x5, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x18, &x19, x17, x7, (x11 & UINT64_C(0x7ffffffffffff))); + fiat_25519_addcarryx_u51(&x20, &x21, x19, x9, (x11 & UINT64_C(0x7ffffffffffff))); + x22 = (x20 << 4); + x23 = (x18 * (uint64_t)0x2); + x24 = (x16 << 6); + x25 = (x14 << 3); + x26 = (uint8_t)(x12 & UINT8_C(0xff)); + x27 = (x12 >> 8); + x28 = (uint8_t)(x27 & UINT8_C(0xff)); + x29 = (x27 >> 8); + x30 = (uint8_t)(x29 & UINT8_C(0xff)); + x31 = (x29 >> 8); + x32 = (uint8_t)(x31 & UINT8_C(0xff)); + x33 = (x31 >> 8); + x34 = (uint8_t)(x33 & UINT8_C(0xff)); + x35 = (x33 >> 8); + x36 = 
(uint8_t)(x35 & UINT8_C(0xff)); + x37 = (uint8_t)(x35 >> 8); + x38 = (x25 + (uint64_t)x37); + x39 = (uint8_t)(x38 & UINT8_C(0xff)); + x40 = (x38 >> 8); + x41 = (uint8_t)(x40 & UINT8_C(0xff)); + x42 = (x40 >> 8); + x43 = (uint8_t)(x42 & UINT8_C(0xff)); + x44 = (x42 >> 8); + x45 = (uint8_t)(x44 & UINT8_C(0xff)); + x46 = (x44 >> 8); + x47 = (uint8_t)(x46 & UINT8_C(0xff)); + x48 = (x46 >> 8); + x49 = (uint8_t)(x48 & UINT8_C(0xff)); + x50 = (uint8_t)(x48 >> 8); + x51 = (x24 + (uint64_t)x50); + x52 = (uint8_t)(x51 & UINT8_C(0xff)); + x53 = (x51 >> 8); + x54 = (uint8_t)(x53 & UINT8_C(0xff)); + x55 = (x53 >> 8); + x56 = (uint8_t)(x55 & UINT8_C(0xff)); + x57 = (x55 >> 8); + x58 = (uint8_t)(x57 & UINT8_C(0xff)); + x59 = (x57 >> 8); + x60 = (uint8_t)(x59 & UINT8_C(0xff)); + x61 = (x59 >> 8); + x62 = (uint8_t)(x61 & UINT8_C(0xff)); + x63 = (x61 >> 8); + x64 = (uint8_t)(x63 & UINT8_C(0xff)); + x65 = (fiat_25519_uint1)(x63 >> 8); + x66 = (x23 + (uint64_t)x65); + x67 = (uint8_t)(x66 & UINT8_C(0xff)); + x68 = (x66 >> 8); + x69 = (uint8_t)(x68 & UINT8_C(0xff)); + x70 = (x68 >> 8); + x71 = (uint8_t)(x70 & UINT8_C(0xff)); + x72 = (x70 >> 8); + x73 = (uint8_t)(x72 & UINT8_C(0xff)); + x74 = (x72 >> 8); + x75 = (uint8_t)(x74 & UINT8_C(0xff)); + x76 = (x74 >> 8); + x77 = (uint8_t)(x76 & UINT8_C(0xff)); + x78 = (uint8_t)(x76 >> 8); + x79 = (x22 + (uint64_t)x78); + x80 = (uint8_t)(x79 & UINT8_C(0xff)); + x81 = (x79 >> 8); + x82 = (uint8_t)(x81 & UINT8_C(0xff)); + x83 = (x81 >> 8); + x84 = (uint8_t)(x83 & UINT8_C(0xff)); + x85 = (x83 >> 8); + x86 = (uint8_t)(x85 & UINT8_C(0xff)); + x87 = (x85 >> 8); + x88 = (uint8_t)(x87 & UINT8_C(0xff)); + x89 = (x87 >> 8); + x90 = (uint8_t)(x89 & UINT8_C(0xff)); + x91 = (uint8_t)(x89 >> 8); + out1[0] = x26; + out1[1] = x28; + out1[2] = x30; + out1[3] = x32; + out1[4] = x34; + out1[5] = x36; + out1[6] = x39; + out1[7] = x41; + out1[8] = x43; + out1[9] = x45; + out1[10] = x47; + out1[11] = x49; + out1[12] = x52; + out1[13] = x54; + out1[14] = x56; + out1[15] = x58; + out1[16] = x60; + out1[17] = x62; + out1[18] = x64; + out1[19] = x67; + out1[20] = x69; + out1[21] = x71; + out1[22] = x73; + out1[23] = x75; + out1[24] = x77; + out1[25] = x80; + out1[26] = x82; + out1[27] = x84; + out1[28] = x86; + out1[29] = x88; + out1[30] = x90; + out1[31] = x91; +} + +/* + * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order. 
+ * + * Postconditions: + * eval out1 mod m = bytes_eval arg1 mod m + * + * Input Bounds: + * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]] + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint8_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint8_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint8_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint8_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + uint64_t x71; + x1 = ((uint64_t)(arg1[31]) << 44); + x2 = ((uint64_t)(arg1[30]) << 36); + x3 = ((uint64_t)(arg1[29]) << 28); + x4 = ((uint64_t)(arg1[28]) << 20); + x5 = ((uint64_t)(arg1[27]) << 12); + x6 = ((uint64_t)(arg1[26]) << 4); + x7 = ((uint64_t)(arg1[25]) << 47); + x8 = ((uint64_t)(arg1[24]) << 39); + x9 = ((uint64_t)(arg1[23]) << 31); + x10 = ((uint64_t)(arg1[22]) << 23); + x11 = ((uint64_t)(arg1[21]) << 15); + x12 = ((uint64_t)(arg1[20]) << 7); + x13 = ((uint64_t)(arg1[19]) << 50); + x14 = ((uint64_t)(arg1[18]) << 42); + x15 = ((uint64_t)(arg1[17]) << 34); + x16 = ((uint64_t)(arg1[16]) << 26); + x17 = ((uint64_t)(arg1[15]) << 18); + x18 = ((uint64_t)(arg1[14]) << 10); + x19 = ((uint64_t)(arg1[13]) << 2); + x20 = ((uint64_t)(arg1[12]) << 45); + x21 = ((uint64_t)(arg1[11]) << 37); + x22 = ((uint64_t)(arg1[10]) << 29); + x23 = ((uint64_t)(arg1[9]) << 21); + x24 = ((uint64_t)(arg1[8]) << 13); + x25 = ((uint64_t)(arg1[7]) << 5); + x26 = ((uint64_t)(arg1[6]) << 48); + x27 = ((uint64_t)(arg1[5]) << 40); + x28 = ((uint64_t)(arg1[4]) << 32); + x29 = ((uint64_t)(arg1[3]) << 24); + x30 = ((uint64_t)(arg1[2]) << 16); + x31 = ((uint64_t)(arg1[1]) << 8); + x32 = (arg1[0]); + x33 = (x31 + (uint64_t)x32); + x34 = (x30 + x33); + x35 = (x29 + x34); + x36 = (x28 + x35); + x37 = (x27 + x36); + x38 = (x26 + x37); + x39 = (x38 & UINT64_C(0x7ffffffffffff)); + x40 = (uint8_t)(x38 >> 51); + x41 = (x25 + (uint64_t)x40); + x42 = (x24 + x41); + x43 = (x23 + x42); + x44 = (x22 + x43); + x45 = (x21 + x44); + x46 = (x20 + x45); + x47 = (x46 & UINT64_C(0x7ffffffffffff)); + x48 = (uint8_t)(x46 >> 51); + x49 = (x19 + (uint64_t)x48); + x50 = (x18 + x49); + x51 = (x17 + x50); + x52 = (x16 + x51); + x53 = (x15 + x52); + x54 = (x14 + 
x53); + x55 = (x13 + x54); + x56 = (x55 & UINT64_C(0x7ffffffffffff)); + x57 = (uint8_t)(x55 >> 51); + x58 = (x12 + (uint64_t)x57); + x59 = (x11 + x58); + x60 = (x10 + x59); + x61 = (x9 + x60); + x62 = (x8 + x61); + x63 = (x7 + x62); + x64 = (x63 & UINT64_C(0x7ffffffffffff)); + x65 = (uint8_t)(x63 >> 51); + x66 = (x6 + (uint64_t)x65); + x67 = (x5 + x66); + x68 = (x4 + x67); + x69 = (x3 + x68); + x70 = (x2 + x69); + x71 = (x1 + x70); + out1[0] = x39; + out1[1] = x47; + out1[2] = x56; + out1[3] = x64; + out1[4] = x71; +} + +/* + * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result. + * + * Postconditions: + * eval out1 mod m = (121666 * eval arg1) mod m + * + */ +static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_25519_uint1 x14; + uint64_t x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + fiat_25519_uint1 x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + fiat_25519_uint1 x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + fiat_25519_uint1 x29; + uint64_t x30; + uint64_t x31; + uint64_t x32; + uint64_t x33; + uint64_t x34; + fiat_25519_uint1 x35; + uint64_t x36; + uint64_t x37; + fiat_25519_uint1 x38; + uint64_t x39; + uint64_t x40; + fiat_25519_mulx_u64(&x1, &x2, UINT32_C(0x1db42), (arg1[4])); + fiat_25519_mulx_u64(&x3, &x4, UINT32_C(0x1db42), (arg1[3])); + fiat_25519_mulx_u64(&x5, &x6, UINT32_C(0x1db42), (arg1[2])); + fiat_25519_mulx_u64(&x7, &x8, UINT32_C(0x1db42), (arg1[1])); + fiat_25519_mulx_u64(&x9, &x10, UINT32_C(0x1db42), (arg1[0])); + x11 = ((x9 >> 51) | ((x10 << 13) & UINT64_C(0xffffffffffffffff))); + x12 = (x9 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x13, &x14, 0x0, x11, x7); + x15 = (x14 + x8); + x16 = ((x13 >> 51) | ((x15 << 13) & UINT64_C(0xffffffffffffffff))); + x17 = (x13 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x18, &x19, 0x0, x16, x5); + x20 = (x19 + x6); + x21 = ((x18 >> 51) | ((x20 << 13) & UINT64_C(0xffffffffffffffff))); + x22 = (x18 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x23, &x24, 0x0, x21, x3); + x25 = (x24 + x4); + x26 = ((x23 >> 51) | ((x25 << 13) & UINT64_C(0xffffffffffffffff))); + x27 = (x23 & UINT64_C(0x7ffffffffffff)); + fiat_25519_addcarryx_u64(&x28, &x29, 0x0, x26, x1); + x30 = (x29 + x2); + x31 = ((x28 >> 51) | ((x30 << 13) & UINT64_C(0xffffffffffffffff))); + x32 = (x28 & UINT64_C(0x7ffffffffffff)); + x33 = (x31 * UINT8_C(0x13)); + x34 = (x12 + x33); + x35 = (fiat_25519_uint1)(x34 >> 51); + x36 = (x34 & UINT64_C(0x7ffffffffffff)); + x37 = (x35 + x17); + x38 = (fiat_25519_uint1)(x37 >> 51); + x39 = (x37 & UINT64_C(0x7ffffffffffff)); + x40 = (x38 + x22); + out1[0] = x36; + out1[1] = x39; + out1[2] = x40; + out1[3] = x27; + out1[4] = x32; +} diff --git a/ring-0.17.14/third_party/fiat/p256_32.h b/ring-0.17.14/third_party/fiat/p256_32.h new file mode 100644 index 0000000000..83289a1d9a --- /dev/null +++ b/ring-0.17.14/third_party/fiat/p256_32.h @@ -0,0 +1,2511 @@ +/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 32 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep 
divstep_precomp */ +/* curve description: p256 */ +/* machine_wordsize = 32 (from "32") */ +/* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */ +/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */ +/* */ +/* NOTE: In addition to the bounds specified above each function, all */ +/* functions synthesized for this Montgomery arithmetic require the */ +/* input to be strictly less than the prime modulus (m), and also */ +/* require the input to be in the unique saturated representation. */ +/* All functions also ensure that these two properties are true of */ +/* return values. */ +/* */ +/* Computed values: */ +/* eval z = z[0] + (z[1] << 32) + (z[2] << 64) + (z[3] << 96) + (z[4] << 128) + (z[5] << 160) + (z[6] << 192) + (z[7] << 224) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* twos_complement_eval z = let x1 := z[0] + (z[1] << 32) + (z[2] << 64) + (z[3] << 96) + (z[4] << 128) + (z[5] << 160) + (z[6] << 192) + (z[7] << 224) in */ +/* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */ + +#include <stdint.h> +typedef unsigned char fiat_p256_uint1; +typedef signed char fiat_p256_int1; +#if defined(__GNUC__) || defined(__clang__) +# define FIAT_P256_FIAT_INLINE __inline__ +#else +# define FIAT_P256_FIAT_INLINE +#endif + +/* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */ +typedef uint32_t fiat_p256_montgomery_domain_field_element[8]; + +/* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */ +typedef uint32_t fiat_p256_non_montgomery_domain_field_element[8]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#if !defined(FIAT_P256_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +static __inline__ uint32_t fiat_p256_value_barrier_u32(uint32_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} +#else +# define fiat_p256_value_barrier_u32(x) (x) +#endif + + +/* + * The function fiat_p256_addcarryx_u32 is an addition with carry. 
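+ *
+ * Illustrative usage (not part of the generated output): this primitive chains
+ * limb-wise additions, e.g. adding two 64-bit values held as hypothetical
+ * 32-bit limb pairs a_lo/a_hi and b_lo/b_hi:
+ *
+ *   uint32_t lo, hi;
+ *   fiat_p256_uint1 c;
+ *   fiat_p256_addcarryx_u32(&lo, &c, 0x0, a_lo, b_lo);   - low limbs, carry in is 0
+ *   fiat_p256_addcarryx_u32(&hi, &c, c, a_hi, b_hi);     - high limbs consume the carry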
+ * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^32 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^32⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffff] + * arg3: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { + uint64_t x1; + uint32_t x2; + fiat_p256_uint1 x3; + x1 = ((arg1 + (uint64_t)arg2) + arg3); + x2 = (uint32_t)(x1 & UINT32_C(0xffffffff)); + x3 = (fiat_p256_uint1)(x1 >> 32); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_p256_subborrowx_u32 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^32 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^32⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffff] + * arg3: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { + int64_t x1; + fiat_p256_int1 x2; + uint32_t x3; + x1 = ((arg2 - (int64_t)arg1) - arg3); + x2 = (fiat_p256_int1)(x1 >> 32); + x3 = (uint32_t)(x1 & UINT32_C(0xffffffff)); + *out1 = x3; + *out2 = (fiat_p256_uint1)(0x0 - x2); +} + +/* + * The function fiat_p256_mulx_u32 is a multiplication, returning the full double-width result. + * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^32 + * out2 = ⌊arg1 * arg2 / 2^32⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffff] + * arg2: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + * out2: [0x0 ~> 0xffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u32(uint32_t* out1, uint32_t* out2, uint32_t arg1, uint32_t arg2) { + uint64_t x1; + uint32_t x2; + uint32_t x3; + x1 = ((uint64_t)arg1 * arg2); + x2 = (uint32_t)(x1 & UINT32_C(0xffffffff)); + x3 = (uint32_t)(x1 >> 32); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_p256_cmovznz_u32 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffff] + * arg3: [0x0 ~> 0xffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u32(uint32_t* out1, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) { + fiat_p256_uint1 x1; + uint32_t x2; + uint32_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_p256_int1)(0x0 - x1) & UINT32_C(0xffffffff)); + x3 = ((fiat_p256_value_barrier_u32(x2) & arg3) | (fiat_p256_value_barrier_u32((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. 
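+ *
+ * Illustrative note (not part of the generated output): the body below is a
+ * word-by-word Montgomery multiplication with R = 2^256. The low 32-bit word of
+ * m is 0xffffffff, so m = -1 (mod 2^32) and the per-word Montgomery factor
+ * (-m^-1 mod 2^32) is simply 1: each reduction round reuses the running
+ * accumulator's low word directly, multiplies it by the nonzero words of m
+ * (four 0xffffffff words and a single 0x1 word), adds that multiple of m, and
+ * lets the now-zero low word drop out. The final subborrowx/cmovznz sequence is
+ * a constant-time conditional subtraction that brings the result back below m.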
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint32_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint32_t x16; + uint32_t x17; + uint32_t x18; + uint32_t x19; + uint32_t x20; + uint32_t x21; + uint32_t x22; + uint32_t x23; + uint32_t x24; + uint32_t x25; + fiat_p256_uint1 x26; + uint32_t x27; + fiat_p256_uint1 x28; + uint32_t x29; + fiat_p256_uint1 x30; + uint32_t x31; + fiat_p256_uint1 x32; + uint32_t x33; + fiat_p256_uint1 x34; + uint32_t x35; + fiat_p256_uint1 x36; + uint32_t x37; + fiat_p256_uint1 x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + uint32_t x42; + uint32_t x43; + uint32_t x44; + uint32_t x45; + uint32_t x46; + uint32_t x47; + uint32_t x48; + fiat_p256_uint1 x49; + uint32_t x50; + fiat_p256_uint1 x51; + uint32_t x52; + uint32_t x53; + fiat_p256_uint1 x54; + uint32_t x55; + fiat_p256_uint1 x56; + uint32_t x57; + fiat_p256_uint1 x58; + uint32_t x59; + fiat_p256_uint1 x60; + uint32_t x61; + fiat_p256_uint1 x62; + uint32_t x63; + fiat_p256_uint1 x64; + uint32_t x65; + fiat_p256_uint1 x66; + uint32_t x67; + fiat_p256_uint1 x68; + uint32_t x69; + fiat_p256_uint1 x70; + uint32_t x71; + uint32_t x72; + uint32_t x73; + uint32_t x74; + uint32_t x75; + uint32_t x76; + uint32_t x77; + uint32_t x78; + uint32_t x79; + uint32_t x80; + uint32_t x81; + uint32_t x82; + uint32_t x83; + uint32_t x84; + uint32_t x85; + uint32_t x86; + uint32_t x87; + fiat_p256_uint1 x88; + uint32_t x89; + fiat_p256_uint1 x90; + uint32_t x91; + fiat_p256_uint1 x92; + uint32_t x93; + fiat_p256_uint1 x94; + uint32_t x95; + fiat_p256_uint1 x96; + uint32_t x97; + fiat_p256_uint1 x98; + uint32_t x99; + fiat_p256_uint1 x100; + uint32_t x101; + uint32_t x102; + fiat_p256_uint1 x103; + uint32_t x104; + fiat_p256_uint1 x105; + uint32_t x106; + fiat_p256_uint1 x107; + uint32_t x108; + fiat_p256_uint1 x109; + uint32_t x110; + fiat_p256_uint1 x111; + uint32_t x112; + fiat_p256_uint1 x113; + uint32_t x114; + fiat_p256_uint1 x115; + uint32_t x116; + fiat_p256_uint1 x117; + uint32_t x118; + fiat_p256_uint1 x119; + uint32_t x120; + uint32_t x121; + uint32_t x122; + uint32_t x123; + uint32_t x124; + uint32_t x125; + uint32_t x126; + uint32_t x127; + uint32_t x128; + fiat_p256_uint1 x129; + uint32_t x130; + fiat_p256_uint1 x131; + uint32_t x132; + uint32_t x133; + fiat_p256_uint1 x134; + uint32_t x135; + fiat_p256_uint1 x136; + uint32_t x137; + fiat_p256_uint1 x138; + uint32_t x139; + fiat_p256_uint1 x140; + uint32_t x141; + fiat_p256_uint1 x142; + uint32_t x143; + fiat_p256_uint1 x144; + uint32_t x145; + fiat_p256_uint1 x146; + uint32_t x147; + fiat_p256_uint1 x148; + uint32_t x149; + fiat_p256_uint1 x150; + uint32_t x151; + uint32_t x152; + uint32_t x153; + uint32_t x154; + uint32_t x155; + uint32_t x156; + uint32_t x157; + uint32_t x158; + uint32_t x159; + uint32_t x160; + uint32_t x161; + uint32_t x162; + uint32_t x163; + uint32_t x164; + uint32_t x165; + uint32_t x166; + uint32_t x167; + uint32_t x168; + fiat_p256_uint1 x169; + uint32_t x170; + fiat_p256_uint1 
x171; + uint32_t x172; + fiat_p256_uint1 x173; + uint32_t x174; + fiat_p256_uint1 x175; + uint32_t x176; + fiat_p256_uint1 x177; + uint32_t x178; + fiat_p256_uint1 x179; + uint32_t x180; + fiat_p256_uint1 x181; + uint32_t x182; + uint32_t x183; + fiat_p256_uint1 x184; + uint32_t x185; + fiat_p256_uint1 x186; + uint32_t x187; + fiat_p256_uint1 x188; + uint32_t x189; + fiat_p256_uint1 x190; + uint32_t x191; + fiat_p256_uint1 x192; + uint32_t x193; + fiat_p256_uint1 x194; + uint32_t x195; + fiat_p256_uint1 x196; + uint32_t x197; + fiat_p256_uint1 x198; + uint32_t x199; + fiat_p256_uint1 x200; + uint32_t x201; + uint32_t x202; + uint32_t x203; + uint32_t x204; + uint32_t x205; + uint32_t x206; + uint32_t x207; + uint32_t x208; + uint32_t x209; + fiat_p256_uint1 x210; + uint32_t x211; + fiat_p256_uint1 x212; + uint32_t x213; + uint32_t x214; + fiat_p256_uint1 x215; + uint32_t x216; + fiat_p256_uint1 x217; + uint32_t x218; + fiat_p256_uint1 x219; + uint32_t x220; + fiat_p256_uint1 x221; + uint32_t x222; + fiat_p256_uint1 x223; + uint32_t x224; + fiat_p256_uint1 x225; + uint32_t x226; + fiat_p256_uint1 x227; + uint32_t x228; + fiat_p256_uint1 x229; + uint32_t x230; + fiat_p256_uint1 x231; + uint32_t x232; + uint32_t x233; + uint32_t x234; + uint32_t x235; + uint32_t x236; + uint32_t x237; + uint32_t x238; + uint32_t x239; + uint32_t x240; + uint32_t x241; + uint32_t x242; + uint32_t x243; + uint32_t x244; + uint32_t x245; + uint32_t x246; + uint32_t x247; + uint32_t x248; + uint32_t x249; + fiat_p256_uint1 x250; + uint32_t x251; + fiat_p256_uint1 x252; + uint32_t x253; + fiat_p256_uint1 x254; + uint32_t x255; + fiat_p256_uint1 x256; + uint32_t x257; + fiat_p256_uint1 x258; + uint32_t x259; + fiat_p256_uint1 x260; + uint32_t x261; + fiat_p256_uint1 x262; + uint32_t x263; + uint32_t x264; + fiat_p256_uint1 x265; + uint32_t x266; + fiat_p256_uint1 x267; + uint32_t x268; + fiat_p256_uint1 x269; + uint32_t x270; + fiat_p256_uint1 x271; + uint32_t x272; + fiat_p256_uint1 x273; + uint32_t x274; + fiat_p256_uint1 x275; + uint32_t x276; + fiat_p256_uint1 x277; + uint32_t x278; + fiat_p256_uint1 x279; + uint32_t x280; + fiat_p256_uint1 x281; + uint32_t x282; + uint32_t x283; + uint32_t x284; + uint32_t x285; + uint32_t x286; + uint32_t x287; + uint32_t x288; + uint32_t x289; + uint32_t x290; + fiat_p256_uint1 x291; + uint32_t x292; + fiat_p256_uint1 x293; + uint32_t x294; + uint32_t x295; + fiat_p256_uint1 x296; + uint32_t x297; + fiat_p256_uint1 x298; + uint32_t x299; + fiat_p256_uint1 x300; + uint32_t x301; + fiat_p256_uint1 x302; + uint32_t x303; + fiat_p256_uint1 x304; + uint32_t x305; + fiat_p256_uint1 x306; + uint32_t x307; + fiat_p256_uint1 x308; + uint32_t x309; + fiat_p256_uint1 x310; + uint32_t x311; + fiat_p256_uint1 x312; + uint32_t x313; + uint32_t x314; + uint32_t x315; + uint32_t x316; + uint32_t x317; + uint32_t x318; + uint32_t x319; + uint32_t x320; + uint32_t x321; + uint32_t x322; + uint32_t x323; + uint32_t x324; + uint32_t x325; + uint32_t x326; + uint32_t x327; + uint32_t x328; + uint32_t x329; + uint32_t x330; + fiat_p256_uint1 x331; + uint32_t x332; + fiat_p256_uint1 x333; + uint32_t x334; + fiat_p256_uint1 x335; + uint32_t x336; + fiat_p256_uint1 x337; + uint32_t x338; + fiat_p256_uint1 x339; + uint32_t x340; + fiat_p256_uint1 x341; + uint32_t x342; + fiat_p256_uint1 x343; + uint32_t x344; + uint32_t x345; + fiat_p256_uint1 x346; + uint32_t x347; + fiat_p256_uint1 x348; + uint32_t x349; + fiat_p256_uint1 x350; + uint32_t x351; + fiat_p256_uint1 x352; + uint32_t x353; + 
fiat_p256_uint1 x354; + uint32_t x355; + fiat_p256_uint1 x356; + uint32_t x357; + fiat_p256_uint1 x358; + uint32_t x359; + fiat_p256_uint1 x360; + uint32_t x361; + fiat_p256_uint1 x362; + uint32_t x363; + uint32_t x364; + uint32_t x365; + uint32_t x366; + uint32_t x367; + uint32_t x368; + uint32_t x369; + uint32_t x370; + uint32_t x371; + fiat_p256_uint1 x372; + uint32_t x373; + fiat_p256_uint1 x374; + uint32_t x375; + uint32_t x376; + fiat_p256_uint1 x377; + uint32_t x378; + fiat_p256_uint1 x379; + uint32_t x380; + fiat_p256_uint1 x381; + uint32_t x382; + fiat_p256_uint1 x383; + uint32_t x384; + fiat_p256_uint1 x385; + uint32_t x386; + fiat_p256_uint1 x387; + uint32_t x388; + fiat_p256_uint1 x389; + uint32_t x390; + fiat_p256_uint1 x391; + uint32_t x392; + fiat_p256_uint1 x393; + uint32_t x394; + uint32_t x395; + uint32_t x396; + uint32_t x397; + uint32_t x398; + uint32_t x399; + uint32_t x400; + uint32_t x401; + uint32_t x402; + uint32_t x403; + uint32_t x404; + uint32_t x405; + uint32_t x406; + uint32_t x407; + uint32_t x408; + uint32_t x409; + uint32_t x410; + uint32_t x411; + fiat_p256_uint1 x412; + uint32_t x413; + fiat_p256_uint1 x414; + uint32_t x415; + fiat_p256_uint1 x416; + uint32_t x417; + fiat_p256_uint1 x418; + uint32_t x419; + fiat_p256_uint1 x420; + uint32_t x421; + fiat_p256_uint1 x422; + uint32_t x423; + fiat_p256_uint1 x424; + uint32_t x425; + uint32_t x426; + fiat_p256_uint1 x427; + uint32_t x428; + fiat_p256_uint1 x429; + uint32_t x430; + fiat_p256_uint1 x431; + uint32_t x432; + fiat_p256_uint1 x433; + uint32_t x434; + fiat_p256_uint1 x435; + uint32_t x436; + fiat_p256_uint1 x437; + uint32_t x438; + fiat_p256_uint1 x439; + uint32_t x440; + fiat_p256_uint1 x441; + uint32_t x442; + fiat_p256_uint1 x443; + uint32_t x444; + uint32_t x445; + uint32_t x446; + uint32_t x447; + uint32_t x448; + uint32_t x449; + uint32_t x450; + uint32_t x451; + uint32_t x452; + fiat_p256_uint1 x453; + uint32_t x454; + fiat_p256_uint1 x455; + uint32_t x456; + uint32_t x457; + fiat_p256_uint1 x458; + uint32_t x459; + fiat_p256_uint1 x460; + uint32_t x461; + fiat_p256_uint1 x462; + uint32_t x463; + fiat_p256_uint1 x464; + uint32_t x465; + fiat_p256_uint1 x466; + uint32_t x467; + fiat_p256_uint1 x468; + uint32_t x469; + fiat_p256_uint1 x470; + uint32_t x471; + fiat_p256_uint1 x472; + uint32_t x473; + fiat_p256_uint1 x474; + uint32_t x475; + uint32_t x476; + uint32_t x477; + uint32_t x478; + uint32_t x479; + uint32_t x480; + uint32_t x481; + uint32_t x482; + uint32_t x483; + uint32_t x484; + uint32_t x485; + uint32_t x486; + uint32_t x487; + uint32_t x488; + uint32_t x489; + uint32_t x490; + uint32_t x491; + uint32_t x492; + fiat_p256_uint1 x493; + uint32_t x494; + fiat_p256_uint1 x495; + uint32_t x496; + fiat_p256_uint1 x497; + uint32_t x498; + fiat_p256_uint1 x499; + uint32_t x500; + fiat_p256_uint1 x501; + uint32_t x502; + fiat_p256_uint1 x503; + uint32_t x504; + fiat_p256_uint1 x505; + uint32_t x506; + uint32_t x507; + fiat_p256_uint1 x508; + uint32_t x509; + fiat_p256_uint1 x510; + uint32_t x511; + fiat_p256_uint1 x512; + uint32_t x513; + fiat_p256_uint1 x514; + uint32_t x515; + fiat_p256_uint1 x516; + uint32_t x517; + fiat_p256_uint1 x518; + uint32_t x519; + fiat_p256_uint1 x520; + uint32_t x521; + fiat_p256_uint1 x522; + uint32_t x523; + fiat_p256_uint1 x524; + uint32_t x525; + uint32_t x526; + uint32_t x527; + uint32_t x528; + uint32_t x529; + uint32_t x530; + uint32_t x531; + uint32_t x532; + uint32_t x533; + fiat_p256_uint1 x534; + uint32_t x535; + fiat_p256_uint1 x536; + uint32_t x537; + 
uint32_t x538; + fiat_p256_uint1 x539; + uint32_t x540; + fiat_p256_uint1 x541; + uint32_t x542; + fiat_p256_uint1 x543; + uint32_t x544; + fiat_p256_uint1 x545; + uint32_t x546; + fiat_p256_uint1 x547; + uint32_t x548; + fiat_p256_uint1 x549; + uint32_t x550; + fiat_p256_uint1 x551; + uint32_t x552; + fiat_p256_uint1 x553; + uint32_t x554; + fiat_p256_uint1 x555; + uint32_t x556; + uint32_t x557; + uint32_t x558; + uint32_t x559; + uint32_t x560; + uint32_t x561; + uint32_t x562; + uint32_t x563; + uint32_t x564; + uint32_t x565; + uint32_t x566; + uint32_t x567; + uint32_t x568; + uint32_t x569; + uint32_t x570; + uint32_t x571; + uint32_t x572; + uint32_t x573; + fiat_p256_uint1 x574; + uint32_t x575; + fiat_p256_uint1 x576; + uint32_t x577; + fiat_p256_uint1 x578; + uint32_t x579; + fiat_p256_uint1 x580; + uint32_t x581; + fiat_p256_uint1 x582; + uint32_t x583; + fiat_p256_uint1 x584; + uint32_t x585; + fiat_p256_uint1 x586; + uint32_t x587; + uint32_t x588; + fiat_p256_uint1 x589; + uint32_t x590; + fiat_p256_uint1 x591; + uint32_t x592; + fiat_p256_uint1 x593; + uint32_t x594; + fiat_p256_uint1 x595; + uint32_t x596; + fiat_p256_uint1 x597; + uint32_t x598; + fiat_p256_uint1 x599; + uint32_t x600; + fiat_p256_uint1 x601; + uint32_t x602; + fiat_p256_uint1 x603; + uint32_t x604; + fiat_p256_uint1 x605; + uint32_t x606; + uint32_t x607; + uint32_t x608; + uint32_t x609; + uint32_t x610; + uint32_t x611; + uint32_t x612; + uint32_t x613; + uint32_t x614; + fiat_p256_uint1 x615; + uint32_t x616; + fiat_p256_uint1 x617; + uint32_t x618; + uint32_t x619; + fiat_p256_uint1 x620; + uint32_t x621; + fiat_p256_uint1 x622; + uint32_t x623; + fiat_p256_uint1 x624; + uint32_t x625; + fiat_p256_uint1 x626; + uint32_t x627; + fiat_p256_uint1 x628; + uint32_t x629; + fiat_p256_uint1 x630; + uint32_t x631; + fiat_p256_uint1 x632; + uint32_t x633; + fiat_p256_uint1 x634; + uint32_t x635; + fiat_p256_uint1 x636; + uint32_t x637; + uint32_t x638; + fiat_p256_uint1 x639; + uint32_t x640; + fiat_p256_uint1 x641; + uint32_t x642; + fiat_p256_uint1 x643; + uint32_t x644; + fiat_p256_uint1 x645; + uint32_t x646; + fiat_p256_uint1 x647; + uint32_t x648; + fiat_p256_uint1 x649; + uint32_t x650; + fiat_p256_uint1 x651; + uint32_t x652; + fiat_p256_uint1 x653; + uint32_t x654; + fiat_p256_uint1 x655; + uint32_t x656; + uint32_t x657; + uint32_t x658; + uint32_t x659; + uint32_t x660; + uint32_t x661; + uint32_t x662; + uint32_t x663; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[4]); + x5 = (arg1[5]); + x6 = (arg1[6]); + x7 = (arg1[7]); + x8 = (arg1[0]); + fiat_p256_mulx_u32(&x9, &x10, x8, (arg2[7])); + fiat_p256_mulx_u32(&x11, &x12, x8, (arg2[6])); + fiat_p256_mulx_u32(&x13, &x14, x8, (arg2[5])); + fiat_p256_mulx_u32(&x15, &x16, x8, (arg2[4])); + fiat_p256_mulx_u32(&x17, &x18, x8, (arg2[3])); + fiat_p256_mulx_u32(&x19, &x20, x8, (arg2[2])); + fiat_p256_mulx_u32(&x21, &x22, x8, (arg2[1])); + fiat_p256_mulx_u32(&x23, &x24, x8, (arg2[0])); + fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21); + fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19); + fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17); + fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15); + fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13); + fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11); + fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9); + x39 = (x38 + x10); + fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x44, &x45, x23, 
UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44); + fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42); + x52 = (x51 + x43); + fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46); + fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48); + fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50); + fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52); + fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0); + fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0); + fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23); + fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40); + fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41); + fiat_p256_mulx_u32(&x71, &x72, x1, (arg2[7])); + fiat_p256_mulx_u32(&x73, &x74, x1, (arg2[6])); + fiat_p256_mulx_u32(&x75, &x76, x1, (arg2[5])); + fiat_p256_mulx_u32(&x77, &x78, x1, (arg2[4])); + fiat_p256_mulx_u32(&x79, &x80, x1, (arg2[3])); + fiat_p256_mulx_u32(&x81, &x82, x1, (arg2[2])); + fiat_p256_mulx_u32(&x83, &x84, x1, (arg2[1])); + fiat_p256_mulx_u32(&x85, &x86, x1, (arg2[0])); + fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83); + fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81); + fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79); + fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77); + fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75); + fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73); + fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71); + x101 = (x100 + x72); + fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85); + fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87); + fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89); + fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91); + fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93); + fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95); + fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97); + fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99); + fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101); + fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124); + fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122); + x132 = (x131 + x123); + fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126); + fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128); + fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130); + fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132); + fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0); + fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0); + fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102); + fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120); + fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121); + x151 = ((uint32_t)x150 + x119); + fiat_p256_mulx_u32(&x152, &x153, x2, (arg2[7])); + fiat_p256_mulx_u32(&x154, &x155, x2, (arg2[6])); + fiat_p256_mulx_u32(&x156, &x157, x2, (arg2[5])); + fiat_p256_mulx_u32(&x158, &x159, x2, (arg2[4])); + fiat_p256_mulx_u32(&x160, &x161, x2, (arg2[3])); + fiat_p256_mulx_u32(&x162, &x163, x2, (arg2[2])); + fiat_p256_mulx_u32(&x164, &x165, x2, (arg2[1])); + fiat_p256_mulx_u32(&x166, &x167, x2, (arg2[0])); + fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164); + fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162); + fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160); + 
fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158); + fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156); + fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154); + fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152); + x182 = (x181 + x153); + fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166); + fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168); + fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170); + fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172); + fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174); + fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176); + fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178); + fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180); + fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182); + fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205); + fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203); + x213 = (x212 + x204); + fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207); + fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209); + fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211); + fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213); + fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0); + fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0); + fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183); + fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201); + fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202); + x232 = ((uint32_t)x231 + x200); + fiat_p256_mulx_u32(&x233, &x234, x3, (arg2[7])); + fiat_p256_mulx_u32(&x235, &x236, x3, (arg2[6])); + fiat_p256_mulx_u32(&x237, &x238, x3, (arg2[5])); + fiat_p256_mulx_u32(&x239, &x240, x3, (arg2[4])); + fiat_p256_mulx_u32(&x241, &x242, x3, (arg2[3])); + fiat_p256_mulx_u32(&x243, &x244, x3, (arg2[2])); + fiat_p256_mulx_u32(&x245, &x246, x3, (arg2[1])); + fiat_p256_mulx_u32(&x247, &x248, x3, (arg2[0])); + fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245); + fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243); + fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241); + fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239); + fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237); + fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235); + fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233); + x263 = (x262 + x234); + fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247); + fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249); + fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251); + fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253); + fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255); + fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257); + fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259); + fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261); + fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263); + fiat_p256_mulx_u32(&x282, &x283, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286); + fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284); + x294 = (x293 + x285); + fiat_p256_addcarryx_u32(&x295, 
&x296, 0x0, x264, x288); + fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290); + fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292); + fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294); + fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0); + fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0); + fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264); + fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282); + fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283); + x313 = ((uint32_t)x312 + x281); + fiat_p256_mulx_u32(&x314, &x315, x4, (arg2[7])); + fiat_p256_mulx_u32(&x316, &x317, x4, (arg2[6])); + fiat_p256_mulx_u32(&x318, &x319, x4, (arg2[5])); + fiat_p256_mulx_u32(&x320, &x321, x4, (arg2[4])); + fiat_p256_mulx_u32(&x322, &x323, x4, (arg2[3])); + fiat_p256_mulx_u32(&x324, &x325, x4, (arg2[2])); + fiat_p256_mulx_u32(&x326, &x327, x4, (arg2[1])); + fiat_p256_mulx_u32(&x328, &x329, x4, (arg2[0])); + fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326); + fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324); + fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322); + fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320); + fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318); + fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316); + fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314); + x344 = (x343 + x315); + fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328); + fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330); + fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332); + fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334); + fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336); + fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338); + fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340); + fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342); + fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344); + fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367); + fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365); + x375 = (x374 + x366); + fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369); + fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371); + fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373); + fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375); + fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0); + fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0); + fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345); + fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363); + fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364); + x394 = ((uint32_t)x393 + x362); + fiat_p256_mulx_u32(&x395, &x396, x5, (arg2[7])); + fiat_p256_mulx_u32(&x397, &x398, x5, (arg2[6])); + fiat_p256_mulx_u32(&x399, &x400, x5, (arg2[5])); + fiat_p256_mulx_u32(&x401, &x402, x5, (arg2[4])); + fiat_p256_mulx_u32(&x403, &x404, x5, (arg2[3])); + fiat_p256_mulx_u32(&x405, &x406, x5, (arg2[2])); + fiat_p256_mulx_u32(&x407, &x408, x5, (arg2[1])); + fiat_p256_mulx_u32(&x409, &x410, x5, (arg2[0])); + fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407); + fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405); + fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403); + fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401); + fiat_p256_addcarryx_u32(&x419, &x420, x418, 
x402, x399); + fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397); + fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395); + x425 = (x424 + x396); + fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409); + fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411); + fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413); + fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415); + fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417); + fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419); + fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421); + fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423); + fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425); + fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448); + fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446); + x456 = (x455 + x447); + fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450); + fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452); + fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454); + fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456); + fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0); + fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0); + fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426); + fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444); + fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445); + x475 = ((uint32_t)x474 + x443); + fiat_p256_mulx_u32(&x476, &x477, x6, (arg2[7])); + fiat_p256_mulx_u32(&x478, &x479, x6, (arg2[6])); + fiat_p256_mulx_u32(&x480, &x481, x6, (arg2[5])); + fiat_p256_mulx_u32(&x482, &x483, x6, (arg2[4])); + fiat_p256_mulx_u32(&x484, &x485, x6, (arg2[3])); + fiat_p256_mulx_u32(&x486, &x487, x6, (arg2[2])); + fiat_p256_mulx_u32(&x488, &x489, x6, (arg2[1])); + fiat_p256_mulx_u32(&x490, &x491, x6, (arg2[0])); + fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488); + fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486); + fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484); + fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482); + fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480); + fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478); + fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476); + x506 = (x505 + x477); + fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490); + fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492); + fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494); + fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496); + fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498); + fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500); + fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502); + fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504); + fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506); + fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529); + fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527); + x537 = (x536 + x528); + fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531); + fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533); + 
fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535); + fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537); + fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0); + fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0); + fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507); + fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525); + fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526); + x556 = ((uint32_t)x555 + x524); + fiat_p256_mulx_u32(&x557, &x558, x7, (arg2[7])); + fiat_p256_mulx_u32(&x559, &x560, x7, (arg2[6])); + fiat_p256_mulx_u32(&x561, &x562, x7, (arg2[5])); + fiat_p256_mulx_u32(&x563, &x564, x7, (arg2[4])); + fiat_p256_mulx_u32(&x565, &x566, x7, (arg2[3])); + fiat_p256_mulx_u32(&x567, &x568, x7, (arg2[2])); + fiat_p256_mulx_u32(&x569, &x570, x7, (arg2[1])); + fiat_p256_mulx_u32(&x571, &x572, x7, (arg2[0])); + fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569); + fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567); + fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565); + fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563); + fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561); + fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559); + fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557); + x587 = (x586 + x558); + fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571); + fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573); + fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575); + fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577); + fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579); + fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581); + fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583); + fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585); + fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587); + fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610); + fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608); + x618 = (x617 + x609); + fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612); + fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614); + fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616); + fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618); + fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0); + fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0); + fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588); + fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606); + fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607); + x637 = ((uint32_t)x636 + x605); + fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0); + fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0); + fiat_p256_subborrowx_u32(&x648, &x649, x647, x631, 0x0); + fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1); + fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0); + fiat_p256_cmovznz_u32(&x656, x655, x638, x621); + fiat_p256_cmovznz_u32(&x657, x655, x640, x623); + fiat_p256_cmovznz_u32(&x658, x655, x642, x625); + fiat_p256_cmovznz_u32(&x659, 
x655, x644, x627); + fiat_p256_cmovznz_u32(&x660, x655, x646, x629); + fiat_p256_cmovznz_u32(&x661, x655, x648, x631); + fiat_p256_cmovznz_u32(&x662, x655, x650, x633); + fiat_p256_cmovznz_u32(&x663, x655, x652, x635); + out1[0] = x656; + out1[1] = x657; + out1[2] = x658; + out1[3] = x659; + out1[4] = x660; + out1[5] = x661; + out1[6] = x662; + out1[7] = x663; +} + +/* + * The function fiat_p256_square squares a field element in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + uint32_t x9; + uint32_t x10; + uint32_t x11; + uint32_t x12; + uint32_t x13; + uint32_t x14; + uint32_t x15; + uint32_t x16; + uint32_t x17; + uint32_t x18; + uint32_t x19; + uint32_t x20; + uint32_t x21; + uint32_t x22; + uint32_t x23; + uint32_t x24; + uint32_t x25; + fiat_p256_uint1 x26; + uint32_t x27; + fiat_p256_uint1 x28; + uint32_t x29; + fiat_p256_uint1 x30; + uint32_t x31; + fiat_p256_uint1 x32; + uint32_t x33; + fiat_p256_uint1 x34; + uint32_t x35; + fiat_p256_uint1 x36; + uint32_t x37; + fiat_p256_uint1 x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + uint32_t x42; + uint32_t x43; + uint32_t x44; + uint32_t x45; + uint32_t x46; + uint32_t x47; + uint32_t x48; + fiat_p256_uint1 x49; + uint32_t x50; + fiat_p256_uint1 x51; + uint32_t x52; + uint32_t x53; + fiat_p256_uint1 x54; + uint32_t x55; + fiat_p256_uint1 x56; + uint32_t x57; + fiat_p256_uint1 x58; + uint32_t x59; + fiat_p256_uint1 x60; + uint32_t x61; + fiat_p256_uint1 x62; + uint32_t x63; + fiat_p256_uint1 x64; + uint32_t x65; + fiat_p256_uint1 x66; + uint32_t x67; + fiat_p256_uint1 x68; + uint32_t x69; + fiat_p256_uint1 x70; + uint32_t x71; + uint32_t x72; + uint32_t x73; + uint32_t x74; + uint32_t x75; + uint32_t x76; + uint32_t x77; + uint32_t x78; + uint32_t x79; + uint32_t x80; + uint32_t x81; + uint32_t x82; + uint32_t x83; + uint32_t x84; + uint32_t x85; + uint32_t x86; + uint32_t x87; + fiat_p256_uint1 x88; + uint32_t x89; + fiat_p256_uint1 x90; + uint32_t x91; + fiat_p256_uint1 x92; + uint32_t x93; + fiat_p256_uint1 x94; + uint32_t x95; + fiat_p256_uint1 x96; + uint32_t x97; + fiat_p256_uint1 x98; + uint32_t x99; + fiat_p256_uint1 x100; + uint32_t x101; + uint32_t x102; + fiat_p256_uint1 x103; + uint32_t x104; + fiat_p256_uint1 x105; + uint32_t x106; + fiat_p256_uint1 x107; + uint32_t x108; + fiat_p256_uint1 x109; + uint32_t x110; + fiat_p256_uint1 x111; + uint32_t x112; + fiat_p256_uint1 x113; + uint32_t x114; + fiat_p256_uint1 x115; + uint32_t x116; + fiat_p256_uint1 x117; + uint32_t x118; + fiat_p256_uint1 x119; + uint32_t x120; + uint32_t x121; + uint32_t x122; + uint32_t x123; + uint32_t x124; + uint32_t x125; + uint32_t x126; + uint32_t x127; + uint32_t x128; + fiat_p256_uint1 x129; + uint32_t x130; + fiat_p256_uint1 x131; + uint32_t x132; + uint32_t x133; + fiat_p256_uint1 x134; + uint32_t x135; + fiat_p256_uint1 x136; + uint32_t x137; + fiat_p256_uint1 x138; + uint32_t x139; + fiat_p256_uint1 x140; + uint32_t x141; + fiat_p256_uint1 x142; + uint32_t x143; + fiat_p256_uint1 x144; + uint32_t x145; + fiat_p256_uint1 x146; + uint32_t x147; + fiat_p256_uint1 x148; + uint32_t x149; + 
fiat_p256_uint1 x150; + uint32_t x151; + uint32_t x152; + uint32_t x153; + uint32_t x154; + uint32_t x155; + uint32_t x156; + uint32_t x157; + uint32_t x158; + uint32_t x159; + uint32_t x160; + uint32_t x161; + uint32_t x162; + uint32_t x163; + uint32_t x164; + uint32_t x165; + uint32_t x166; + uint32_t x167; + uint32_t x168; + fiat_p256_uint1 x169; + uint32_t x170; + fiat_p256_uint1 x171; + uint32_t x172; + fiat_p256_uint1 x173; + uint32_t x174; + fiat_p256_uint1 x175; + uint32_t x176; + fiat_p256_uint1 x177; + uint32_t x178; + fiat_p256_uint1 x179; + uint32_t x180; + fiat_p256_uint1 x181; + uint32_t x182; + uint32_t x183; + fiat_p256_uint1 x184; + uint32_t x185; + fiat_p256_uint1 x186; + uint32_t x187; + fiat_p256_uint1 x188; + uint32_t x189; + fiat_p256_uint1 x190; + uint32_t x191; + fiat_p256_uint1 x192; + uint32_t x193; + fiat_p256_uint1 x194; + uint32_t x195; + fiat_p256_uint1 x196; + uint32_t x197; + fiat_p256_uint1 x198; + uint32_t x199; + fiat_p256_uint1 x200; + uint32_t x201; + uint32_t x202; + uint32_t x203; + uint32_t x204; + uint32_t x205; + uint32_t x206; + uint32_t x207; + uint32_t x208; + uint32_t x209; + fiat_p256_uint1 x210; + uint32_t x211; + fiat_p256_uint1 x212; + uint32_t x213; + uint32_t x214; + fiat_p256_uint1 x215; + uint32_t x216; + fiat_p256_uint1 x217; + uint32_t x218; + fiat_p256_uint1 x219; + uint32_t x220; + fiat_p256_uint1 x221; + uint32_t x222; + fiat_p256_uint1 x223; + uint32_t x224; + fiat_p256_uint1 x225; + uint32_t x226; + fiat_p256_uint1 x227; + uint32_t x228; + fiat_p256_uint1 x229; + uint32_t x230; + fiat_p256_uint1 x231; + uint32_t x232; + uint32_t x233; + uint32_t x234; + uint32_t x235; + uint32_t x236; + uint32_t x237; + uint32_t x238; + uint32_t x239; + uint32_t x240; + uint32_t x241; + uint32_t x242; + uint32_t x243; + uint32_t x244; + uint32_t x245; + uint32_t x246; + uint32_t x247; + uint32_t x248; + uint32_t x249; + fiat_p256_uint1 x250; + uint32_t x251; + fiat_p256_uint1 x252; + uint32_t x253; + fiat_p256_uint1 x254; + uint32_t x255; + fiat_p256_uint1 x256; + uint32_t x257; + fiat_p256_uint1 x258; + uint32_t x259; + fiat_p256_uint1 x260; + uint32_t x261; + fiat_p256_uint1 x262; + uint32_t x263; + uint32_t x264; + fiat_p256_uint1 x265; + uint32_t x266; + fiat_p256_uint1 x267; + uint32_t x268; + fiat_p256_uint1 x269; + uint32_t x270; + fiat_p256_uint1 x271; + uint32_t x272; + fiat_p256_uint1 x273; + uint32_t x274; + fiat_p256_uint1 x275; + uint32_t x276; + fiat_p256_uint1 x277; + uint32_t x278; + fiat_p256_uint1 x279; + uint32_t x280; + fiat_p256_uint1 x281; + uint32_t x282; + uint32_t x283; + uint32_t x284; + uint32_t x285; + uint32_t x286; + uint32_t x287; + uint32_t x288; + uint32_t x289; + uint32_t x290; + fiat_p256_uint1 x291; + uint32_t x292; + fiat_p256_uint1 x293; + uint32_t x294; + uint32_t x295; + fiat_p256_uint1 x296; + uint32_t x297; + fiat_p256_uint1 x298; + uint32_t x299; + fiat_p256_uint1 x300; + uint32_t x301; + fiat_p256_uint1 x302; + uint32_t x303; + fiat_p256_uint1 x304; + uint32_t x305; + fiat_p256_uint1 x306; + uint32_t x307; + fiat_p256_uint1 x308; + uint32_t x309; + fiat_p256_uint1 x310; + uint32_t x311; + fiat_p256_uint1 x312; + uint32_t x313; + uint32_t x314; + uint32_t x315; + uint32_t x316; + uint32_t x317; + uint32_t x318; + uint32_t x319; + uint32_t x320; + uint32_t x321; + uint32_t x322; + uint32_t x323; + uint32_t x324; + uint32_t x325; + uint32_t x326; + uint32_t x327; + uint32_t x328; + uint32_t x329; + uint32_t x330; + fiat_p256_uint1 x331; + uint32_t x332; + fiat_p256_uint1 x333; + uint32_t x334; + 
fiat_p256_uint1 x335; + uint32_t x336; + fiat_p256_uint1 x337; + uint32_t x338; + fiat_p256_uint1 x339; + uint32_t x340; + fiat_p256_uint1 x341; + uint32_t x342; + fiat_p256_uint1 x343; + uint32_t x344; + uint32_t x345; + fiat_p256_uint1 x346; + uint32_t x347; + fiat_p256_uint1 x348; + uint32_t x349; + fiat_p256_uint1 x350; + uint32_t x351; + fiat_p256_uint1 x352; + uint32_t x353; + fiat_p256_uint1 x354; + uint32_t x355; + fiat_p256_uint1 x356; + uint32_t x357; + fiat_p256_uint1 x358; + uint32_t x359; + fiat_p256_uint1 x360; + uint32_t x361; + fiat_p256_uint1 x362; + uint32_t x363; + uint32_t x364; + uint32_t x365; + uint32_t x366; + uint32_t x367; + uint32_t x368; + uint32_t x369; + uint32_t x370; + uint32_t x371; + fiat_p256_uint1 x372; + uint32_t x373; + fiat_p256_uint1 x374; + uint32_t x375; + uint32_t x376; + fiat_p256_uint1 x377; + uint32_t x378; + fiat_p256_uint1 x379; + uint32_t x380; + fiat_p256_uint1 x381; + uint32_t x382; + fiat_p256_uint1 x383; + uint32_t x384; + fiat_p256_uint1 x385; + uint32_t x386; + fiat_p256_uint1 x387; + uint32_t x388; + fiat_p256_uint1 x389; + uint32_t x390; + fiat_p256_uint1 x391; + uint32_t x392; + fiat_p256_uint1 x393; + uint32_t x394; + uint32_t x395; + uint32_t x396; + uint32_t x397; + uint32_t x398; + uint32_t x399; + uint32_t x400; + uint32_t x401; + uint32_t x402; + uint32_t x403; + uint32_t x404; + uint32_t x405; + uint32_t x406; + uint32_t x407; + uint32_t x408; + uint32_t x409; + uint32_t x410; + uint32_t x411; + fiat_p256_uint1 x412; + uint32_t x413; + fiat_p256_uint1 x414; + uint32_t x415; + fiat_p256_uint1 x416; + uint32_t x417; + fiat_p256_uint1 x418; + uint32_t x419; + fiat_p256_uint1 x420; + uint32_t x421; + fiat_p256_uint1 x422; + uint32_t x423; + fiat_p256_uint1 x424; + uint32_t x425; + uint32_t x426; + fiat_p256_uint1 x427; + uint32_t x428; + fiat_p256_uint1 x429; + uint32_t x430; + fiat_p256_uint1 x431; + uint32_t x432; + fiat_p256_uint1 x433; + uint32_t x434; + fiat_p256_uint1 x435; + uint32_t x436; + fiat_p256_uint1 x437; + uint32_t x438; + fiat_p256_uint1 x439; + uint32_t x440; + fiat_p256_uint1 x441; + uint32_t x442; + fiat_p256_uint1 x443; + uint32_t x444; + uint32_t x445; + uint32_t x446; + uint32_t x447; + uint32_t x448; + uint32_t x449; + uint32_t x450; + uint32_t x451; + uint32_t x452; + fiat_p256_uint1 x453; + uint32_t x454; + fiat_p256_uint1 x455; + uint32_t x456; + uint32_t x457; + fiat_p256_uint1 x458; + uint32_t x459; + fiat_p256_uint1 x460; + uint32_t x461; + fiat_p256_uint1 x462; + uint32_t x463; + fiat_p256_uint1 x464; + uint32_t x465; + fiat_p256_uint1 x466; + uint32_t x467; + fiat_p256_uint1 x468; + uint32_t x469; + fiat_p256_uint1 x470; + uint32_t x471; + fiat_p256_uint1 x472; + uint32_t x473; + fiat_p256_uint1 x474; + uint32_t x475; + uint32_t x476; + uint32_t x477; + uint32_t x478; + uint32_t x479; + uint32_t x480; + uint32_t x481; + uint32_t x482; + uint32_t x483; + uint32_t x484; + uint32_t x485; + uint32_t x486; + uint32_t x487; + uint32_t x488; + uint32_t x489; + uint32_t x490; + uint32_t x491; + uint32_t x492; + fiat_p256_uint1 x493; + uint32_t x494; + fiat_p256_uint1 x495; + uint32_t x496; + fiat_p256_uint1 x497; + uint32_t x498; + fiat_p256_uint1 x499; + uint32_t x500; + fiat_p256_uint1 x501; + uint32_t x502; + fiat_p256_uint1 x503; + uint32_t x504; + fiat_p256_uint1 x505; + uint32_t x506; + uint32_t x507; + fiat_p256_uint1 x508; + uint32_t x509; + fiat_p256_uint1 x510; + uint32_t x511; + fiat_p256_uint1 x512; + uint32_t x513; + fiat_p256_uint1 x514; + uint32_t x515; + fiat_p256_uint1 x516; + uint32_t 
x517; + fiat_p256_uint1 x518; + uint32_t x519; + fiat_p256_uint1 x520; + uint32_t x521; + fiat_p256_uint1 x522; + uint32_t x523; + fiat_p256_uint1 x524; + uint32_t x525; + uint32_t x526; + uint32_t x527; + uint32_t x528; + uint32_t x529; + uint32_t x530; + uint32_t x531; + uint32_t x532; + uint32_t x533; + fiat_p256_uint1 x534; + uint32_t x535; + fiat_p256_uint1 x536; + uint32_t x537; + uint32_t x538; + fiat_p256_uint1 x539; + uint32_t x540; + fiat_p256_uint1 x541; + uint32_t x542; + fiat_p256_uint1 x543; + uint32_t x544; + fiat_p256_uint1 x545; + uint32_t x546; + fiat_p256_uint1 x547; + uint32_t x548; + fiat_p256_uint1 x549; + uint32_t x550; + fiat_p256_uint1 x551; + uint32_t x552; + fiat_p256_uint1 x553; + uint32_t x554; + fiat_p256_uint1 x555; + uint32_t x556; + uint32_t x557; + uint32_t x558; + uint32_t x559; + uint32_t x560; + uint32_t x561; + uint32_t x562; + uint32_t x563; + uint32_t x564; + uint32_t x565; + uint32_t x566; + uint32_t x567; + uint32_t x568; + uint32_t x569; + uint32_t x570; + uint32_t x571; + uint32_t x572; + uint32_t x573; + fiat_p256_uint1 x574; + uint32_t x575; + fiat_p256_uint1 x576; + uint32_t x577; + fiat_p256_uint1 x578; + uint32_t x579; + fiat_p256_uint1 x580; + uint32_t x581; + fiat_p256_uint1 x582; + uint32_t x583; + fiat_p256_uint1 x584; + uint32_t x585; + fiat_p256_uint1 x586; + uint32_t x587; + uint32_t x588; + fiat_p256_uint1 x589; + uint32_t x590; + fiat_p256_uint1 x591; + uint32_t x592; + fiat_p256_uint1 x593; + uint32_t x594; + fiat_p256_uint1 x595; + uint32_t x596; + fiat_p256_uint1 x597; + uint32_t x598; + fiat_p256_uint1 x599; + uint32_t x600; + fiat_p256_uint1 x601; + uint32_t x602; + fiat_p256_uint1 x603; + uint32_t x604; + fiat_p256_uint1 x605; + uint32_t x606; + uint32_t x607; + uint32_t x608; + uint32_t x609; + uint32_t x610; + uint32_t x611; + uint32_t x612; + uint32_t x613; + uint32_t x614; + fiat_p256_uint1 x615; + uint32_t x616; + fiat_p256_uint1 x617; + uint32_t x618; + uint32_t x619; + fiat_p256_uint1 x620; + uint32_t x621; + fiat_p256_uint1 x622; + uint32_t x623; + fiat_p256_uint1 x624; + uint32_t x625; + fiat_p256_uint1 x626; + uint32_t x627; + fiat_p256_uint1 x628; + uint32_t x629; + fiat_p256_uint1 x630; + uint32_t x631; + fiat_p256_uint1 x632; + uint32_t x633; + fiat_p256_uint1 x634; + uint32_t x635; + fiat_p256_uint1 x636; + uint32_t x637; + uint32_t x638; + fiat_p256_uint1 x639; + uint32_t x640; + fiat_p256_uint1 x641; + uint32_t x642; + fiat_p256_uint1 x643; + uint32_t x644; + fiat_p256_uint1 x645; + uint32_t x646; + fiat_p256_uint1 x647; + uint32_t x648; + fiat_p256_uint1 x649; + uint32_t x650; + fiat_p256_uint1 x651; + uint32_t x652; + fiat_p256_uint1 x653; + uint32_t x654; + fiat_p256_uint1 x655; + uint32_t x656; + uint32_t x657; + uint32_t x658; + uint32_t x659; + uint32_t x660; + uint32_t x661; + uint32_t x662; + uint32_t x663; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[4]); + x5 = (arg1[5]); + x6 = (arg1[6]); + x7 = (arg1[7]); + x8 = (arg1[0]); + fiat_p256_mulx_u32(&x9, &x10, x8, (arg1[7])); + fiat_p256_mulx_u32(&x11, &x12, x8, (arg1[6])); + fiat_p256_mulx_u32(&x13, &x14, x8, (arg1[5])); + fiat_p256_mulx_u32(&x15, &x16, x8, (arg1[4])); + fiat_p256_mulx_u32(&x17, &x18, x8, (arg1[3])); + fiat_p256_mulx_u32(&x19, &x20, x8, (arg1[2])); + fiat_p256_mulx_u32(&x21, &x22, x8, (arg1[1])); + fiat_p256_mulx_u32(&x23, &x24, x8, (arg1[0])); + fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21); + fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19); + fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17); + 
fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15); + fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13); + fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11); + fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9); + x39 = (x38 + x10); + fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x44, &x45, x23, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44); + fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42); + x52 = (x51 + x43); + fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46); + fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48); + fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50); + fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52); + fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0); + fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0); + fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23); + fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40); + fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41); + fiat_p256_mulx_u32(&x71, &x72, x1, (arg1[7])); + fiat_p256_mulx_u32(&x73, &x74, x1, (arg1[6])); + fiat_p256_mulx_u32(&x75, &x76, x1, (arg1[5])); + fiat_p256_mulx_u32(&x77, &x78, x1, (arg1[4])); + fiat_p256_mulx_u32(&x79, &x80, x1, (arg1[3])); + fiat_p256_mulx_u32(&x81, &x82, x1, (arg1[2])); + fiat_p256_mulx_u32(&x83, &x84, x1, (arg1[1])); + fiat_p256_mulx_u32(&x85, &x86, x1, (arg1[0])); + fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83); + fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81); + fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79); + fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77); + fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75); + fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73); + fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71); + x101 = (x100 + x72); + fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85); + fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87); + fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89); + fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91); + fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93); + fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95); + fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97); + fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99); + fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101); + fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124); + fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122); + x132 = (x131 + x123); + fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126); + fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128); + fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130); + fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132); + fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0); + fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0); + fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102); + fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120); + fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121); + x151 = ((uint32_t)x150 + x119); + fiat_p256_mulx_u32(&x152, &x153, x2, (arg1[7])); + fiat_p256_mulx_u32(&x154, &x155, x2, (arg1[6])); + fiat_p256_mulx_u32(&x156, &x157, x2, (arg1[5])); + fiat_p256_mulx_u32(&x158, &x159, x2, 
(arg1[4])); + fiat_p256_mulx_u32(&x160, &x161, x2, (arg1[3])); + fiat_p256_mulx_u32(&x162, &x163, x2, (arg1[2])); + fiat_p256_mulx_u32(&x164, &x165, x2, (arg1[1])); + fiat_p256_mulx_u32(&x166, &x167, x2, (arg1[0])); + fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164); + fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162); + fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160); + fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158); + fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156); + fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154); + fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152); + x182 = (x181 + x153); + fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166); + fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168); + fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170); + fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172); + fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174); + fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176); + fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178); + fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180); + fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182); + fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205); + fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203); + x213 = (x212 + x204); + fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207); + fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209); + fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211); + fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213); + fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0); + fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0); + fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183); + fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201); + fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202); + x232 = ((uint32_t)x231 + x200); + fiat_p256_mulx_u32(&x233, &x234, x3, (arg1[7])); + fiat_p256_mulx_u32(&x235, &x236, x3, (arg1[6])); + fiat_p256_mulx_u32(&x237, &x238, x3, (arg1[5])); + fiat_p256_mulx_u32(&x239, &x240, x3, (arg1[4])); + fiat_p256_mulx_u32(&x241, &x242, x3, (arg1[3])); + fiat_p256_mulx_u32(&x243, &x244, x3, (arg1[2])); + fiat_p256_mulx_u32(&x245, &x246, x3, (arg1[1])); + fiat_p256_mulx_u32(&x247, &x248, x3, (arg1[0])); + fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245); + fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243); + fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241); + fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239); + fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237); + fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235); + fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233); + x263 = (x262 + x234); + fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247); + fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249); + fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251); + fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253); + fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255); + fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257); + fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259); + fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261); + fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263); + fiat_p256_mulx_u32(&x282, &x283, 
x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286); + fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284); + x294 = (x293 + x285); + fiat_p256_addcarryx_u32(&x295, &x296, 0x0, x264, x288); + fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290); + fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292); + fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294); + fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0); + fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0); + fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264); + fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282); + fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283); + x313 = ((uint32_t)x312 + x281); + fiat_p256_mulx_u32(&x314, &x315, x4, (arg1[7])); + fiat_p256_mulx_u32(&x316, &x317, x4, (arg1[6])); + fiat_p256_mulx_u32(&x318, &x319, x4, (arg1[5])); + fiat_p256_mulx_u32(&x320, &x321, x4, (arg1[4])); + fiat_p256_mulx_u32(&x322, &x323, x4, (arg1[3])); + fiat_p256_mulx_u32(&x324, &x325, x4, (arg1[2])); + fiat_p256_mulx_u32(&x326, &x327, x4, (arg1[1])); + fiat_p256_mulx_u32(&x328, &x329, x4, (arg1[0])); + fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326); + fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324); + fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322); + fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320); + fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318); + fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316); + fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314); + x344 = (x343 + x315); + fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328); + fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330); + fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332); + fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334); + fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336); + fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338); + fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340); + fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342); + fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344); + fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367); + fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365); + x375 = (x374 + x366); + fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369); + fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371); + fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373); + fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375); + fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0); + fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0); + fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345); + fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363); + fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364); + x394 = ((uint32_t)x393 + x362); + fiat_p256_mulx_u32(&x395, &x396, x5, (arg1[7])); + fiat_p256_mulx_u32(&x397, &x398, x5, (arg1[6])); + fiat_p256_mulx_u32(&x399, &x400, x5, (arg1[5])); + fiat_p256_mulx_u32(&x401, &x402, x5, (arg1[4])); + fiat_p256_mulx_u32(&x403, &x404, x5, (arg1[3])); + fiat_p256_mulx_u32(&x405, &x406, x5, 
(arg1[2])); + fiat_p256_mulx_u32(&x407, &x408, x5, (arg1[1])); + fiat_p256_mulx_u32(&x409, &x410, x5, (arg1[0])); + fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407); + fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405); + fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403); + fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401); + fiat_p256_addcarryx_u32(&x419, &x420, x418, x402, x399); + fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397); + fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395); + x425 = (x424 + x396); + fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409); + fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411); + fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413); + fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415); + fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417); + fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419); + fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421); + fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423); + fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425); + fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448); + fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446); + x456 = (x455 + x447); + fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450); + fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452); + fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454); + fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456); + fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0); + fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0); + fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426); + fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444); + fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445); + x475 = ((uint32_t)x474 + x443); + fiat_p256_mulx_u32(&x476, &x477, x6, (arg1[7])); + fiat_p256_mulx_u32(&x478, &x479, x6, (arg1[6])); + fiat_p256_mulx_u32(&x480, &x481, x6, (arg1[5])); + fiat_p256_mulx_u32(&x482, &x483, x6, (arg1[4])); + fiat_p256_mulx_u32(&x484, &x485, x6, (arg1[3])); + fiat_p256_mulx_u32(&x486, &x487, x6, (arg1[2])); + fiat_p256_mulx_u32(&x488, &x489, x6, (arg1[1])); + fiat_p256_mulx_u32(&x490, &x491, x6, (arg1[0])); + fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488); + fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486); + fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484); + fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482); + fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480); + fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478); + fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476); + x506 = (x505 + x477); + fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490); + fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492); + fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494); + fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496); + fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498); + fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500); + fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502); + fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504); + fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506); + fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff)); + 
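+ /* (Editorial note.) One more row, for arg1[7], and its reduction step
+  * follow; the trailing subborrowx/cmovznz block then conditionally
+  * subtracts p once, so the stored result lies in [0, p). */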
fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529); + fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527); + x537 = (x536 + x528); + fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531); + fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533); + fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535); + fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537); + fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0); + fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0); + fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507); + fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525); + fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526); + x556 = ((uint32_t)x555 + x524); + fiat_p256_mulx_u32(&x557, &x558, x7, (arg1[7])); + fiat_p256_mulx_u32(&x559, &x560, x7, (arg1[6])); + fiat_p256_mulx_u32(&x561, &x562, x7, (arg1[5])); + fiat_p256_mulx_u32(&x563, &x564, x7, (arg1[4])); + fiat_p256_mulx_u32(&x565, &x566, x7, (arg1[3])); + fiat_p256_mulx_u32(&x567, &x568, x7, (arg1[2])); + fiat_p256_mulx_u32(&x569, &x570, x7, (arg1[1])); + fiat_p256_mulx_u32(&x571, &x572, x7, (arg1[0])); + fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569); + fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567); + fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565); + fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563); + fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561); + fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559); + fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557); + x587 = (x586 + x558); + fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571); + fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573); + fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575); + fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577); + fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579); + fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581); + fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583); + fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585); + fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587); + fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff)); + fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610); + fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608); + x618 = (x617 + x609); + fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612); + fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614); + fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616); + fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618); + fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0); + fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0); + fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588); + fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606); + fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607); + x637 = ((uint32_t)x636 + x605); + fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0); + fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0); + fiat_p256_subborrowx_u32(&x648, &x649, x647, 
x631, 0x0); + fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1); + fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0); + fiat_p256_cmovznz_u32(&x656, x655, x638, x621); + fiat_p256_cmovznz_u32(&x657, x655, x640, x623); + fiat_p256_cmovznz_u32(&x658, x655, x642, x625); + fiat_p256_cmovznz_u32(&x659, x655, x644, x627); + fiat_p256_cmovznz_u32(&x660, x655, x646, x629); + fiat_p256_cmovznz_u32(&x661, x655, x648, x631); + fiat_p256_cmovznz_u32(&x662, x655, x650, x633); + fiat_p256_cmovznz_u32(&x663, x655, x652, x635); + out1[0] = x656; + out1[1] = x657; + out1[2] = x658; + out1[3] = x659; + out1[4] = x660; + out1[5] = x661; + out1[6] = x662; + out1[7] = x663; +} + +/* + * The function fiat_p256_add adds two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint32_t x1; + fiat_p256_uint1 x2; + uint32_t x3; + fiat_p256_uint1 x4; + uint32_t x5; + fiat_p256_uint1 x6; + uint32_t x7; + fiat_p256_uint1 x8; + uint32_t x9; + fiat_p256_uint1 x10; + uint32_t x11; + fiat_p256_uint1 x12; + uint32_t x13; + fiat_p256_uint1 x14; + uint32_t x15; + fiat_p256_uint1 x16; + uint32_t x17; + fiat_p256_uint1 x18; + uint32_t x19; + fiat_p256_uint1 x20; + uint32_t x21; + fiat_p256_uint1 x22; + uint32_t x23; + fiat_p256_uint1 x24; + uint32_t x25; + fiat_p256_uint1 x26; + uint32_t x27; + fiat_p256_uint1 x28; + uint32_t x29; + fiat_p256_uint1 x30; + uint32_t x31; + fiat_p256_uint1 x32; + uint32_t x33; + fiat_p256_uint1 x34; + uint32_t x35; + uint32_t x36; + uint32_t x37; + uint32_t x38; + uint32_t x39; + uint32_t x40; + uint32_t x41; + uint32_t x42; + fiat_p256_addcarryx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_addcarryx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_addcarryx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_addcarryx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_addcarryx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4])); + fiat_p256_addcarryx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5])); + fiat_p256_addcarryx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6])); + fiat_p256_addcarryx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7])); + fiat_p256_subborrowx_u32(&x17, &x18, 0x0, x1, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x19, &x20, x18, x3, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x21, &x22, x20, x5, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x23, &x24, x22, x7, 0x0); + fiat_p256_subborrowx_u32(&x25, &x26, x24, x9, 0x0); + fiat_p256_subborrowx_u32(&x27, &x28, x26, x11, 0x0); + fiat_p256_subborrowx_u32(&x29, &x30, x28, x13, 0x1); + fiat_p256_subborrowx_u32(&x31, &x32, x30, x15, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u32(&x33, &x34, x32, x16, 0x0); + fiat_p256_cmovznz_u32(&x35, x34, x17, x1); + fiat_p256_cmovznz_u32(&x36, x34, x19, x3); + fiat_p256_cmovznz_u32(&x37, x34, x21, x5); + fiat_p256_cmovznz_u32(&x38, x34, x23, x7); + fiat_p256_cmovznz_u32(&x39, x34, x25, x9); + fiat_p256_cmovznz_u32(&x40, x34, x27, x11); + fiat_p256_cmovznz_u32(&x41, x34, x29, x13); + fiat_p256_cmovznz_u32(&x42, x34, x31, x15); + out1[0] = x35; + out1[1] = x36; + out1[2] = x37; + out1[3] = 
x38; + out1[4] = x39; + out1[5] = x40; + out1[6] = x41; + out1[7] = x42; +} + +/* + * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint32_t x1; + fiat_p256_uint1 x2; + uint32_t x3; + fiat_p256_uint1 x4; + uint32_t x5; + fiat_p256_uint1 x6; + uint32_t x7; + fiat_p256_uint1 x8; + uint32_t x9; + fiat_p256_uint1 x10; + uint32_t x11; + fiat_p256_uint1 x12; + uint32_t x13; + fiat_p256_uint1 x14; + uint32_t x15; + fiat_p256_uint1 x16; + uint32_t x17; + uint32_t x18; + fiat_p256_uint1 x19; + uint32_t x20; + fiat_p256_uint1 x21; + uint32_t x22; + fiat_p256_uint1 x23; + uint32_t x24; + fiat_p256_uint1 x25; + uint32_t x26; + fiat_p256_uint1 x27; + uint32_t x28; + fiat_p256_uint1 x29; + uint32_t x30; + fiat_p256_uint1 x31; + uint32_t x32; + fiat_p256_uint1 x33; + fiat_p256_subborrowx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_subborrowx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_subborrowx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_subborrowx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_subborrowx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4])); + fiat_p256_subborrowx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5])); + fiat_p256_subborrowx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6])); + fiat_p256_subborrowx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7])); + fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, x17); + fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, x17); + fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, x17); + fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0); + fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0); + fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0); + fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1)); + fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, x17); + out1[0] = x18; + out1[1] = x20; + out1[2] = x22; + out1[3] = x24; + out1[4] = x26; + out1[5] = x28; + out1[6] = x30; + out1[7] = x32; +} + +/* + * The function fiat_p256_opp negates a field element in the Montgomery domain. 
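+ * (Editorial note, not part of the generated file: the body computes 0 - arg1
+ *  with a borrow chain and, whenever arg1 was non-zero, adds p back through
+ *  the all-ones mask x17, so the output stays in [0, p).)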
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint32_t x1; + fiat_p256_uint1 x2; + uint32_t x3; + fiat_p256_uint1 x4; + uint32_t x5; + fiat_p256_uint1 x6; + uint32_t x7; + fiat_p256_uint1 x8; + uint32_t x9; + fiat_p256_uint1 x10; + uint32_t x11; + fiat_p256_uint1 x12; + uint32_t x13; + fiat_p256_uint1 x14; + uint32_t x15; + fiat_p256_uint1 x16; + uint32_t x17; + uint32_t x18; + fiat_p256_uint1 x19; + uint32_t x20; + fiat_p256_uint1 x21; + uint32_t x22; + fiat_p256_uint1 x23; + uint32_t x24; + fiat_p256_uint1 x25; + uint32_t x26; + fiat_p256_uint1 x27; + uint32_t x28; + fiat_p256_uint1 x29; + uint32_t x30; + fiat_p256_uint1 x31; + uint32_t x32; + fiat_p256_uint1 x33; + fiat_p256_subborrowx_u32(&x1, &x2, 0x0, 0x0, (arg1[0])); + fiat_p256_subborrowx_u32(&x3, &x4, x2, 0x0, (arg1[1])); + fiat_p256_subborrowx_u32(&x5, &x6, x4, 0x0, (arg1[2])); + fiat_p256_subborrowx_u32(&x7, &x8, x6, 0x0, (arg1[3])); + fiat_p256_subborrowx_u32(&x9, &x10, x8, 0x0, (arg1[4])); + fiat_p256_subborrowx_u32(&x11, &x12, x10, 0x0, (arg1[5])); + fiat_p256_subborrowx_u32(&x13, &x14, x12, 0x0, (arg1[6])); + fiat_p256_subborrowx_u32(&x15, &x16, x14, 0x0, (arg1[7])); + fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff)); + fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, x17); + fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, x17); + fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, x17); + fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0); + fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0); + fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0); + fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1)); + fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, x17); + out1[0] = x18; + out1[1] = x20; + out1[2] = x22; + out1[3] = x24; + out1[4] = x26; + out1[5] = x28; + out1[6] = x30; + out1[7] = x32; +} + +/* + * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 + * + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] + * Output Bounds: + * out1: [0x0 ~> 0xffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint32_t* out1, const uint32_t arg1[8]) { + uint32_t x1; + x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | ((arg1[3]) | ((arg1[4]) | ((arg1[5]) | ((arg1[6]) | (arg1[7])))))))); + *out1 = x1; +} + +/* + * The function fiat_p256_selectznz is a multi-limb conditional select. 
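+ * (Editorial note: the select is performed limb by limb with
+ *  fiat_p256_cmovznz_u32, i.e. with masks rather than a data-dependent
+ *  branch, which is what is intended to keep it constant-time.)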
+ * + * Postconditions: + * eval out1 = (if arg1 = 0 then eval arg2 else eval arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] + * arg3: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint32_t out1[8], fiat_p256_uint1 arg1, const uint32_t arg2[8], const uint32_t arg3[8]) { + uint32_t x1; + uint32_t x2; + uint32_t x3; + uint32_t x4; + uint32_t x5; + uint32_t x6; + uint32_t x7; + uint32_t x8; + fiat_p256_cmovznz_u32(&x1, arg1, (arg2[0]), (arg3[0])); + fiat_p256_cmovznz_u32(&x2, arg1, (arg2[1]), (arg3[1])); + fiat_p256_cmovznz_u32(&x3, arg1, (arg2[2]), (arg3[2])); + fiat_p256_cmovznz_u32(&x4, arg1, (arg2[3]), (arg3[3])); + fiat_p256_cmovznz_u32(&x5, arg1, (arg2[4]), (arg3[4])); + fiat_p256_cmovznz_u32(&x6, arg1, (arg2[5]), (arg3[5])); + fiat_p256_cmovznz_u32(&x7, arg1, (arg2[6]), (arg3[6])); + fiat_p256_cmovznz_u32(&x8, arg1, (arg2[7]), (arg3[7])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; + out1[4] = x5; + out1[5] = x6; + out1[6] = x7; + out1[7] = x8; +} diff --git a/ring-0.17.14/third_party/fiat/p256_64.h b/ring-0.17.14/third_party/fiat/p256_64.h new file mode 100644 index 0000000000..a06dd225e4 --- /dev/null +++ b/ring-0.17.14/third_party/fiat/p256_64.h @@ -0,0 +1,957 @@ +/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 64 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */ +/* curve description: p256 */ +/* machine_wordsize = 64 (from "64") */ +/* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */ +/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */ +/* */ +/* NOTE: In addition to the bounds specified above each function, all */ +/* functions synthesized for this Montgomery arithmetic require the */ +/* input to be strictly less than the prime modulus (m), and also */ +/* require the input to be in the unique saturated representation. */ +/* All functions also ensure that these two properties are true of */ +/* return values. 
*/ +/* */ +/* Computed values: */ +/* eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) */ +/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */ +/* twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in */ +/* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */ + +#include +typedef unsigned char fiat_p256_uint1; +typedef signed char fiat_p256_int1; +#if defined(__GNUC__) || defined(__clang__) +# define FIAT_P256_FIAT_EXTENSION __extension__ +# define FIAT_P256_FIAT_INLINE __inline__ +#else +# define FIAT_P256_FIAT_EXTENSION +# define FIAT_P256_FIAT_INLINE +#endif + +FIAT_P256_FIAT_EXTENSION typedef signed __int128 fiat_p256_int128; +FIAT_P256_FIAT_EXTENSION typedef unsigned __int128 fiat_p256_uint128; + +/* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ +typedef uint64_t fiat_p256_montgomery_domain_field_element[4]; + +/* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ +typedef uint64_t fiat_p256_non_montgomery_domain_field_element[4]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#if !defined(FIAT_P256_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +static __inline__ uint64_t fiat_p256_value_barrier_u64(uint64_t a) { + __asm__("" : "+r"(a) : /* no inputs */); + return a; +} +#else +# define fiat_p256_value_barrier_u64(x) (x) +#endif + + +/* + * The function fiat_p256_addcarryx_u64 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^64 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_p256_uint128 x1; + uint64_t x2; + fiat_p256_uint1 x3; + x1 = ((arg1 + (fiat_p256_uint128)arg2) + arg3); + x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); + x3 = (fiat_p256_uint1)(x1 >> 64); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_p256_subborrowx_u64 is a subtraction with borrow. 
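+ * (Editorial note: the implementation below derives the borrow by
+ *  arithmetic-shifting the signed 128-bit difference, which yields 0 or -1,
+ *  and then negating that value to obtain a 0/1 flag.)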
+ * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_p256_int128 x1; + fiat_p256_int1 x2; + uint64_t x3; + x1 = ((arg2 - (fiat_p256_int128)arg1) - arg3); + x2 = (fiat_p256_int1)(x1 >> 64); + x3 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); + *out1 = x3; + *out2 = (fiat_p256_uint1)(0x0 - x2); +} + +/* + * The function fiat_p256_mulx_u64 is a multiplication, returning the full double-width result. + * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^64 + * out2 = ⌊arg1 * arg2 / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { + fiat_p256_uint128 x1; + uint64_t x2; + uint64_t x3; + x1 = ((fiat_p256_uint128)arg1 * arg2); + x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff)); + x3 = (uint64_t)(x1 >> 64); + *out1 = x2; + *out2 = x3; +} + +/* + * The function fiat_p256_cmovznz_u64 is a single-word conditional move. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_p256_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_p256_value_barrier_u64(x2) & arg3) | (fiat_p256_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. 
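+ * (Editorial note, not part of the generated header: this is a word-by-word
+ *  Montgomery multiplication over four 64-bit limbs. Each round multiplies one
+ *  limb of arg1 by arg2, accumulates the row, and folds in a reduction step;
+ *  since the low limb of p is 2^64 - 1, -p^-1 mod 2^64 = 1, so the running low
+ *  limb (x11, x54, x99, x144) serves directly as the reduction multiplier, and
+ *  the constants 0xffffffffffffffff, 0xffffffff and 0xffffffff00000001 are the
+ *  non-zero 64-bit limbs of p. A final conditional subtraction keeps the
+ *  result below p.)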
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + fiat_p256_uint1 x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + fiat_p256_uint1 x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + uint64_t x99; + fiat_p256_uint1 x100; + uint64_t x101; + fiat_p256_uint1 x102; + uint64_t x103; + fiat_p256_uint1 x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + fiat_p256_uint1 x119; + uint64_t x120; + fiat_p256_uint1 x121; + uint64_t x122; + fiat_p256_uint1 x123; + uint64_t x124; + fiat_p256_uint1 x125; + uint64_t x126; + fiat_p256_uint1 x127; + uint64_t x128; + uint64_t x129; + uint64_t x130; + uint64_t x131; + uint64_t x132; + uint64_t x133; + uint64_t x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + fiat_p256_uint1 x140; + uint64_t x141; + fiat_p256_uint1 x142; + uint64_t x143; + uint64_t x144; + fiat_p256_uint1 x145; + uint64_t x146; + fiat_p256_uint1 x147; + uint64_t x148; + fiat_p256_uint1 x149; + uint64_t x150; + fiat_p256_uint1 x151; + uint64_t x152; + fiat_p256_uint1 x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + uint64_t x158; + uint64_t x159; + uint64_t x160; + fiat_p256_uint1 x161; + uint64_t x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + fiat_p256_uint1 x168; + uint64_t x169; + 
fiat_p256_uint1 x170; + uint64_t x171; + fiat_p256_uint1 x172; + uint64_t x173; + uint64_t x174; + fiat_p256_uint1 x175; + uint64_t x176; + fiat_p256_uint1 x177; + uint64_t x178; + fiat_p256_uint1 x179; + uint64_t x180; + fiat_p256_uint1 x181; + uint64_t x182; + fiat_p256_uint1 x183; + uint64_t x184; + uint64_t x185; + uint64_t x186; + uint64_t x187; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + fiat_p256_mulx_u64(&x5, &x6, x4, (arg2[3])); + fiat_p256_mulx_u64(&x7, &x8, x4, (arg2[2])); + fiat_p256_mulx_u64(&x9, &x10, x4, (arg2[1])); + fiat_p256_mulx_u64(&x11, &x12, x4, (arg2[0])); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + x19 = (x18 + x6); + fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); + x28 = (x27 + x23); + fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); + fiat_p256_mulx_u64(&x39, &x40, x1, (arg2[3])); + fiat_p256_mulx_u64(&x41, &x42, x1, (arg2[2])); + fiat_p256_mulx_u64(&x43, &x44, x1, (arg2[1])); + fiat_p256_mulx_u64(&x45, &x46, x1, (arg2[0])); + fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); + fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); + x53 = (x52 + x40); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); + fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); + fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); + x72 = (x71 + x67); + fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); + fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); + fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); + fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); + fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); + x83 = ((uint64_t)x82 + x63); + fiat_p256_mulx_u64(&x84, &x85, x2, (arg2[3])); + fiat_p256_mulx_u64(&x86, &x87, x2, (arg2[2])); + fiat_p256_mulx_u64(&x88, &x89, x2, (arg2[1])); + fiat_p256_mulx_u64(&x90, &x91, x2, (arg2[0])); + fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); + fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); + x98 = (x97 + x85); + fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); + fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); + fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); + fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); + fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); + fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); + x117 = (x116 + x112); + fiat_p256_addcarryx_u64(&x118, &x119, 
0x0, x99, x113); + fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); + fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); + fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); + fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); + x128 = ((uint64_t)x127 + x108); + fiat_p256_mulx_u64(&x129, &x130, x3, (arg2[3])); + fiat_p256_mulx_u64(&x131, &x132, x3, (arg2[2])); + fiat_p256_mulx_u64(&x133, &x134, x3, (arg2[1])); + fiat_p256_mulx_u64(&x135, &x136, x3, (arg2[0])); + fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); + fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); + fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); + x143 = (x142 + x130); + fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); + fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); + fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); + fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); + fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); + fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); + x162 = (x161 + x157); + fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); + fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); + fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); + fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); + fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); + x173 = ((uint64_t)x172 + x153); + fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); + fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); + fiat_p256_cmovznz_u64(&x184, x183, x174, x165); + fiat_p256_cmovznz_u64(&x185, x183, x176, x167); + fiat_p256_cmovznz_u64(&x186, x183, x178, x169); + fiat_p256_cmovznz_u64(&x187, x183, x180, x171); + out1[0] = x184; + out1[1] = x185; + out1[2] = x186; + out1[3] = x187; +} + +/* + * The function fiat_p256_square squares a field element in the Montgomery domain. 
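+ * (Editorial note: the body is the same word-by-word Montgomery multiplication
+ *  as fiat_p256_mul above, specialized to arg2 = arg1. A minimal, hypothetical
+ *  caller sketch, assuming a and b already hold Montgomery-domain values and
+ *  with R = 2^256:
+ *      fiat_p256_montgomery_domain_field_element a, b, t;
+ *      fiat_p256_mul(t, a, b);      t gets a*b*R^-1 mod p
+ *      fiat_p256_square(t, t);      t gets t*t*R^-1 mod p
+ *  Conversions into and out of the Montgomery domain are provided by the
+ *  to_montgomery and from_montgomery operations listed in the file header.)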
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + fiat_p256_uint1 x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + fiat_p256_uint1 x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + uint64_t x99; + fiat_p256_uint1 x100; + uint64_t x101; + fiat_p256_uint1 x102; + uint64_t x103; + fiat_p256_uint1 x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + fiat_p256_uint1 x119; + uint64_t x120; + fiat_p256_uint1 x121; + uint64_t x122; + fiat_p256_uint1 x123; + uint64_t x124; + fiat_p256_uint1 x125; + uint64_t x126; + fiat_p256_uint1 x127; + uint64_t x128; + uint64_t x129; + uint64_t x130; + uint64_t x131; + uint64_t x132; + uint64_t x133; + uint64_t x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + fiat_p256_uint1 x140; + uint64_t x141; + fiat_p256_uint1 x142; + uint64_t x143; + uint64_t x144; + fiat_p256_uint1 x145; + uint64_t x146; + fiat_p256_uint1 x147; + uint64_t x148; + fiat_p256_uint1 x149; + uint64_t x150; + fiat_p256_uint1 x151; + uint64_t x152; + fiat_p256_uint1 x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + uint64_t x158; + uint64_t x159; + uint64_t x160; + fiat_p256_uint1 x161; + uint64_t x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + fiat_p256_uint1 x168; + uint64_t x169; + fiat_p256_uint1 x170; + uint64_t x171; + fiat_p256_uint1 x172; + uint64_t x173; + 
uint64_t x174; + fiat_p256_uint1 x175; + uint64_t x176; + fiat_p256_uint1 x177; + uint64_t x178; + fiat_p256_uint1 x179; + uint64_t x180; + fiat_p256_uint1 x181; + uint64_t x182; + fiat_p256_uint1 x183; + uint64_t x184; + uint64_t x185; + uint64_t x186; + uint64_t x187; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + fiat_p256_mulx_u64(&x5, &x6, x4, (arg1[3])); + fiat_p256_mulx_u64(&x7, &x8, x4, (arg1[2])); + fiat_p256_mulx_u64(&x9, &x10, x4, (arg1[1])); + fiat_p256_mulx_u64(&x11, &x12, x4, (arg1[0])); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + x19 = (x18 + x6); + fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); + x28 = (x27 + x23); + fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); + fiat_p256_mulx_u64(&x39, &x40, x1, (arg1[3])); + fiat_p256_mulx_u64(&x41, &x42, x1, (arg1[2])); + fiat_p256_mulx_u64(&x43, &x44, x1, (arg1[1])); + fiat_p256_mulx_u64(&x45, &x46, x1, (arg1[0])); + fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); + fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); + x53 = (x52 + x40); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); + fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); + fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); + x72 = (x71 + x67); + fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); + fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); + fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); + fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); + fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); + x83 = ((uint64_t)x82 + x63); + fiat_p256_mulx_u64(&x84, &x85, x2, (arg1[3])); + fiat_p256_mulx_u64(&x86, &x87, x2, (arg1[2])); + fiat_p256_mulx_u64(&x88, &x89, x2, (arg1[1])); + fiat_p256_mulx_u64(&x90, &x91, x2, (arg1[0])); + fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); + fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); + x98 = (x97 + x85); + fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); + fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); + fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); + fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); + fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); + fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); + x117 = (x116 + x112); + fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); + fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); + 
fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); + fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); + fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); + x128 = ((uint64_t)x127 + x108); + fiat_p256_mulx_u64(&x129, &x130, x3, (arg1[3])); + fiat_p256_mulx_u64(&x131, &x132, x3, (arg1[2])); + fiat_p256_mulx_u64(&x133, &x134, x3, (arg1[1])); + fiat_p256_mulx_u64(&x135, &x136, x3, (arg1[0])); + fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); + fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); + fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); + x143 = (x142 + x130); + fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); + fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); + fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); + fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); + fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); + fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); + x162 = (x161 + x157); + fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); + fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); + fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); + fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); + fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); + x173 = ((uint64_t)x172 + x153); + fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); + fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); + fiat_p256_cmovznz_u64(&x184, x183, x174, x165); + fiat_p256_cmovznz_u64(&x185, x183, x176, x167); + fiat_p256_cmovznz_u64(&x186, x183, x178, x169); + fiat_p256_cmovznz_u64(&x187, x183, x180, x171); + out1[0] = x184; + out1[1] = x185; + out1[2] = x186; + out1[3] = x187; +} + +/* + * The function fiat_p256_add adds two field elements in the Montgomery domain. 
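+ * (Editorial note: the body is a plain 256-bit addition followed by a
+ *  conditional subtraction of p selected with fiat_p256_cmovznz_u64; because
+ *  both inputs are below p, a single subtraction suffices to keep the result
+ *  in [0, p).)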
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + fiat_p256_uint1 x10; + uint64_t x11; + fiat_p256_uint1 x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0); + fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0); + fiat_p256_cmovznz_u64(&x19, x18, x9, x1); + fiat_p256_cmovznz_u64(&x20, x18, x11, x3); + fiat_p256_cmovznz_u64(&x21, x18, x13, x5); + fiat_p256_cmovznz_u64(&x22, x18, x15, x7); + out1[0] = x19; + out1[1] = x20; + out1[2] = x21; + out1[3] = x22; +} + +/* + * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); + out1[0] = x10; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * The function fiat_p256_opp negates a field element in the Montgomery domain. 
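+ * (Editorial note: as in the 32-bit variant earlier in this change, the body
+ *  computes 0 - arg1 with a borrow chain and adds p back through the all-ones
+ *  mask x9 whenever arg1 was non-zero.)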
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + fiat_p256_subborrowx_u64(&x1, &x2, 0x0, 0x0, (arg1[0])); + fiat_p256_subborrowx_u64(&x3, &x4, x2, 0x0, (arg1[1])); + fiat_p256_subborrowx_u64(&x5, &x6, x4, 0x0, (arg1[2])); + fiat_p256_subborrowx_u64(&x7, &x8, x6, 0x0, (arg1[3])); + fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); + out1[0] = x10; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 + * + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint64_t* out1, const uint64_t arg1[4]) { + uint64_t x1; + x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | (arg1[3])))); + *out1 = x1; +} + +/* + * The function fiat_p256_selectznz is a multi-limb conditional select. 
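+ * (Editorial note: selection happens limb by limb through
+ *  fiat_p256_cmovznz_u64, i.e. with value-barrier-protected masks rather than
+ *  a data-dependent branch.)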
+ *
+ * Postconditions:
+ *   eval out1 = (if arg1 = 0 then eval arg2 else eval arg3)
+ *
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0x1]
+ *   arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint64_t out1[4], fiat_p256_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  fiat_p256_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0]));
+  fiat_p256_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1]));
+  fiat_p256_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2]));
+  fiat_p256_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3]));
+  out1[0] = x1;
+  out1[1] = x2;
+  out1[2] = x3;
+  out1[3] = x4;
+}
+
diff --git a/ring-0.17.14/third_party/fiat/p256_64_msvc.h b/ring-0.17.14/third_party/fiat/p256_64_msvc.h
new file mode 100644
index 0000000000..8b65a37342
--- /dev/null
+++ b/ring-0.17.14/third_party/fiat/p256_64_msvc.h
@@ -0,0 +1,2002 @@
+/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier --no-wide-int p256 64 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */
+/* curve description: p256 */
+/* machine_wordsize = 64 (from "64") */
+/* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */
+/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */
+/* */
+/* NOTE: In addition to the bounds specified above each function, all */
+/* functions synthesized for this Montgomery arithmetic require the */
+/* input to be strictly less than the prime modulus (m), and also */
+/* require the input to be in the unique saturated representation. */
+/* All functions also ensure that these two properties are true of */
+/* return values. */
+/* */
+/* Computed values: */
+/* eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) */
+/* bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */
+/* twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in */
+/* if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */
+
+#include <stdint.h>
+#include <string.h>
+#if defined(_M_X64)
+#include <intrin.h>
+#endif
+
+typedef unsigned char fiat_p256_uint1;
+typedef signed char fiat_p256_int1;
+
+#define FIAT_P256_FIAT_INLINE inline
+
+/* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain.
*/ +/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ +typedef uint64_t fiat_p256_montgomery_domain_field_element[4]; + +/* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */ +/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */ +typedef uint64_t fiat_p256_non_montgomery_domain_field_element[4]; + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +#define fiat_p256_value_barrier_u64(x) (x) + + +/* + * The function fiat_p256_addcarryx_u64 is an addition with carry. + * + * Postconditions: + * out1 = (arg1 + arg2 + arg3) mod 2^64 + * out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(_M_X64) + *out2 = _addcarry_u64(arg1, arg2, arg3, out1); +#else + arg2 += arg1; + arg1 = arg2 < arg1; + arg3 += arg2; + arg1 += arg3 < arg2; + *out1 = arg3; + *out2 = arg1; +#endif +} + +/* + * The function fiat_p256_subborrowx_u64 is a subtraction with borrow. + * + * Postconditions: + * out1 = (-arg1 + arg2 + -arg3) mod 2^64 + * out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0x1] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { +#if defined(_M_X64) + *out2 = _subborrow_u64(arg1, arg2, arg3, out1); // NOTE: edited after generation +#else + *out1 = arg2 - arg3 - arg1; + *out2 = (arg2 < arg3) | ((arg2 == arg3) & arg1); +#endif +} + +/* + * The function fiat_p256_mulx_u64 is a multiplication, returning the full double-width result. + * + * Postconditions: + * out1 = (arg1 * arg2) mod 2^64 + * out2 = ⌊arg1 * arg2 / 2^64⌋ + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) { +// NOTE: edited after generation +#if defined(_M_X64) + *out1 = _umul128(arg1, arg2, out2); +#elif defined(_M_ARM64) + *out1 = arg1 * arg2; + *out2 = __umulh(arg1, arg2); +#else +#error "This file is intended for MSVC on X64 or ARM64" +#endif +} + +/* + * The function fiat_p256_cmovznz_u64 is a single-word conditional move. 
+ * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [0x0 ~> 0xffffffffffffffff] + * arg3: [0x0 ~> 0xffffffffffffffff] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) { + fiat_p256_uint1 x1; + uint64_t x2; + uint64_t x3; + x1 = (!(!arg1)); + x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff)); + x3 = ((fiat_p256_value_barrier_u64(x2) & arg3) | (fiat_p256_value_barrier_u64((~x2)) & arg2)); + *out1 = x3; +} + +/* + * The function fiat_p256_mul multiplies two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + fiat_p256_uint1 x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + fiat_p256_uint1 x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + uint64_t x99; + fiat_p256_uint1 x100; + uint64_t x101; + fiat_p256_uint1 x102; + uint64_t x103; + fiat_p256_uint1 x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + fiat_p256_uint1 x119; + uint64_t x120; + fiat_p256_uint1 x121; + uint64_t x122; + fiat_p256_uint1 x123; + uint64_t x124; + fiat_p256_uint1 x125; + uint64_t x126; + fiat_p256_uint1 x127; + uint64_t x128; + uint64_t x129; + uint64_t x130; + uint64_t x131; + uint64_t x132; + 
uint64_t x133; + uint64_t x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + fiat_p256_uint1 x140; + uint64_t x141; + fiat_p256_uint1 x142; + uint64_t x143; + uint64_t x144; + fiat_p256_uint1 x145; + uint64_t x146; + fiat_p256_uint1 x147; + uint64_t x148; + fiat_p256_uint1 x149; + uint64_t x150; + fiat_p256_uint1 x151; + uint64_t x152; + fiat_p256_uint1 x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + uint64_t x158; + uint64_t x159; + uint64_t x160; + fiat_p256_uint1 x161; + uint64_t x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + fiat_p256_uint1 x168; + uint64_t x169; + fiat_p256_uint1 x170; + uint64_t x171; + fiat_p256_uint1 x172; + uint64_t x173; + uint64_t x174; + fiat_p256_uint1 x175; + uint64_t x176; + fiat_p256_uint1 x177; + uint64_t x178; + fiat_p256_uint1 x179; + uint64_t x180; + fiat_p256_uint1 x181; + uint64_t x182; + fiat_p256_uint1 x183; + uint64_t x184; + uint64_t x185; + uint64_t x186; + uint64_t x187; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + fiat_p256_mulx_u64(&x5, &x6, x4, (arg2[3])); + fiat_p256_mulx_u64(&x7, &x8, x4, (arg2[2])); + fiat_p256_mulx_u64(&x9, &x10, x4, (arg2[1])); + fiat_p256_mulx_u64(&x11, &x12, x4, (arg2[0])); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + x19 = (x18 + x6); + fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); + x28 = (x27 + x23); + fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); + fiat_p256_mulx_u64(&x39, &x40, x1, (arg2[3])); + fiat_p256_mulx_u64(&x41, &x42, x1, (arg2[2])); + fiat_p256_mulx_u64(&x43, &x44, x1, (arg2[1])); + fiat_p256_mulx_u64(&x45, &x46, x1, (arg2[0])); + fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); + fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); + x53 = (x52 + x40); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); + fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); + fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); + x72 = (x71 + x67); + fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); + fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); + fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); + fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); + fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); + x83 = ((uint64_t)x82 + x63); + fiat_p256_mulx_u64(&x84, &x85, x2, (arg2[3])); + fiat_p256_mulx_u64(&x86, &x87, x2, (arg2[2])); + fiat_p256_mulx_u64(&x88, &x89, x2, (arg2[1])); + fiat_p256_mulx_u64(&x90, &x91, x2, (arg2[0])); + fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); + 
fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); + x98 = (x97 + x85); + fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); + fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); + fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); + fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); + fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); + fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); + x117 = (x116 + x112); + fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); + fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); + fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); + fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); + fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); + x128 = ((uint64_t)x127 + x108); + fiat_p256_mulx_u64(&x129, &x130, x3, (arg2[3])); + fiat_p256_mulx_u64(&x131, &x132, x3, (arg2[2])); + fiat_p256_mulx_u64(&x133, &x134, x3, (arg2[1])); + fiat_p256_mulx_u64(&x135, &x136, x3, (arg2[0])); + fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); + fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); + fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); + x143 = (x142 + x130); + fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); + fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); + fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); + fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); + fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); + fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); + x162 = (x161 + x157); + fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); + fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); + fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); + fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); + fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); + x173 = ((uint64_t)x172 + x153); + fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); + fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); + fiat_p256_cmovznz_u64(&x184, x183, x174, x165); + fiat_p256_cmovznz_u64(&x185, x183, x176, x167); + fiat_p256_cmovznz_u64(&x186, x183, x178, x169); + fiat_p256_cmovznz_u64(&x187, x183, x180, x171); + out1[0] = x184; + out1[1] = x185; + out1[2] = x186; + out1[3] = x187; +} + +/* + * The function fiat_p256_square squares a field element in the Montgomery domain. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + fiat_p256_uint1 x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + uint64_t x67; + uint64_t x68; + uint64_t x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + uint64_t x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + fiat_p256_uint1 x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + uint64_t x99; + fiat_p256_uint1 x100; + uint64_t x101; + fiat_p256_uint1 x102; + uint64_t x103; + fiat_p256_uint1 x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + uint64_t x110; + uint64_t x111; + uint64_t x112; + uint64_t x113; + uint64_t x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + fiat_p256_uint1 x119; + uint64_t x120; + fiat_p256_uint1 x121; + uint64_t x122; + fiat_p256_uint1 x123; + uint64_t x124; + fiat_p256_uint1 x125; + uint64_t x126; + fiat_p256_uint1 x127; + uint64_t x128; + uint64_t x129; + uint64_t x130; + uint64_t x131; + uint64_t x132; + uint64_t x133; + uint64_t x134; + uint64_t x135; + uint64_t x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + fiat_p256_uint1 x140; + uint64_t x141; + fiat_p256_uint1 x142; + uint64_t x143; + uint64_t x144; + fiat_p256_uint1 x145; + uint64_t x146; + fiat_p256_uint1 x147; + uint64_t x148; + fiat_p256_uint1 x149; + uint64_t x150; + fiat_p256_uint1 x151; + uint64_t x152; + fiat_p256_uint1 x153; + uint64_t x154; + uint64_t x155; + uint64_t x156; + uint64_t x157; + uint64_t x158; + uint64_t x159; + uint64_t x160; + fiat_p256_uint1 x161; + uint64_t x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + fiat_p256_uint1 x168; + uint64_t x169; + fiat_p256_uint1 x170; + uint64_t x171; + fiat_p256_uint1 x172; + uint64_t x173; + 
uint64_t x174; + fiat_p256_uint1 x175; + uint64_t x176; + fiat_p256_uint1 x177; + uint64_t x178; + fiat_p256_uint1 x179; + uint64_t x180; + fiat_p256_uint1 x181; + uint64_t x182; + fiat_p256_uint1 x183; + uint64_t x184; + uint64_t x185; + uint64_t x186; + uint64_t x187; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + fiat_p256_mulx_u64(&x5, &x6, x4, (arg1[3])); + fiat_p256_mulx_u64(&x7, &x8, x4, (arg1[2])); + fiat_p256_mulx_u64(&x9, &x10, x4, (arg1[1])); + fiat_p256_mulx_u64(&x11, &x12, x4, (arg1[0])); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + x19 = (x18 + x6); + fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22); + x28 = (x27 + x23); + fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21); + fiat_p256_mulx_u64(&x39, &x40, x1, (arg1[3])); + fiat_p256_mulx_u64(&x41, &x42, x1, (arg1[2])); + fiat_p256_mulx_u64(&x43, &x44, x1, (arg1[1])); + fiat_p256_mulx_u64(&x45, &x46, x1, (arg1[0])); + fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41); + fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39); + x53 = (x52 + x40); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49); + fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51); + fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66); + x72 = (x71 + x67); + fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68); + fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70); + fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72); + fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64); + fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65); + x83 = ((uint64_t)x82 + x63); + fiat_p256_mulx_u64(&x84, &x85, x2, (arg1[3])); + fiat_p256_mulx_u64(&x86, &x87, x2, (arg1[2])); + fiat_p256_mulx_u64(&x88, &x89, x2, (arg1[1])); + fiat_p256_mulx_u64(&x90, &x91, x2, (arg1[0])); + fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88); + fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84); + x98 = (x97 + x85); + fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90); + fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92); + fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94); + fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96); + fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98); + fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111); + x117 = (x116 + x112); + fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113); + fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115); + 
fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117); + fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109); + fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110); + x128 = ((uint64_t)x127 + x108); + fiat_p256_mulx_u64(&x129, &x130, x3, (arg1[3])); + fiat_p256_mulx_u64(&x131, &x132, x3, (arg1[2])); + fiat_p256_mulx_u64(&x133, &x134, x3, (arg1[1])); + fiat_p256_mulx_u64(&x135, &x136, x3, (arg1[0])); + fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133); + fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131); + fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129); + x143 = (x142 + x130); + fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135); + fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137); + fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139); + fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141); + fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143); + fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156); + x162 = (x161 + x157); + fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158); + fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160); + fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162); + fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154); + fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155); + x173 = ((uint64_t)x172 + x153); + fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0); + fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0); + fiat_p256_cmovznz_u64(&x184, x183, x174, x165); + fiat_p256_cmovznz_u64(&x185, x183, x176, x167); + fiat_p256_cmovznz_u64(&x186, x183, x178, x169); + fiat_p256_cmovznz_u64(&x187, x183, x180, x171); + out1[0] = x184; + out1[1] = x185; + out1[2] = x186; + out1[3] = x187; +} + +/* + * The function fiat_p256_add adds two field elements in the Montgomery domain. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + fiat_p256_uint1 x10; + uint64_t x11; + fiat_p256_uint1 x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0); + fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0); + fiat_p256_cmovznz_u64(&x19, x18, x9, x1); + fiat_p256_cmovznz_u64(&x20, x18, x11, x3); + fiat_p256_cmovznz_u64(&x21, x18, x13, x5); + fiat_p256_cmovznz_u64(&x22, x18, x15, x7); + out1[0] = x19; + out1[1] = x20; + out1[2] = x21; + out1[3] = x22; +} + +/* + * The function fiat_p256_sub subtracts two field elements in the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * 0 ≤ eval arg2 < m + * Postconditions: + * eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0])); + fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1])); + fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2])); + fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3])); + fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); + out1[0] = x10; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * The function fiat_p256_opp negates a field element in the Montgomery domain. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + fiat_p256_uint1 x2; + uint64_t x3; + fiat_p256_uint1 x4; + uint64_t x5; + fiat_p256_uint1 x6; + uint64_t x7; + fiat_p256_uint1 x8; + uint64_t x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + fiat_p256_subborrowx_u64(&x1, &x2, 0x0, 0x0, (arg1[0])); + fiat_p256_subborrowx_u64(&x3, &x4, x2, 0x0, (arg1[1])); + fiat_p256_subborrowx_u64(&x5, &x6, x4, 0x0, (arg1[2])); + fiat_p256_subborrowx_u64(&x7, &x8, x6, 0x0, (arg1[3])); + fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9); + fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0); + fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001))); + out1[0] = x10; + out1[1] = x12; + out1[2] = x14; + out1[3] = x16; +} + +/* + * The function fiat_p256_from_montgomery translates a field element out of the Montgomery domain. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval out1 mod m = (eval arg1 * ((2^64)⁻¹ mod m)^4) mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_from_montgomery(fiat_p256_non_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + fiat_p256_uint1 x9; + uint64_t x10; + fiat_p256_uint1 x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + fiat_p256_uint1 x23; + uint64_t x24; + fiat_p256_uint1 x25; + uint64_t x26; + fiat_p256_uint1 x27; + uint64_t x28; + fiat_p256_uint1 x29; + uint64_t x30; + fiat_p256_uint1 x31; + uint64_t x32; + fiat_p256_uint1 x33; + uint64_t x34; + fiat_p256_uint1 x35; + uint64_t x36; + fiat_p256_uint1 x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + fiat_p256_uint1 x45; + uint64_t x46; + fiat_p256_uint1 x47; + uint64_t x48; + fiat_p256_uint1 x49; + uint64_t x50; + fiat_p256_uint1 x51; + uint64_t x52; + fiat_p256_uint1 x53; + uint64_t x54; + fiat_p256_uint1 x55; + uint64_t x56; + fiat_p256_uint1 x57; + uint64_t x58; + fiat_p256_uint1 x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint64_t x65; + uint64_t x66; + fiat_p256_uint1 x67; + uint64_t x68; + fiat_p256_uint1 x69; + uint64_t x70; + fiat_p256_uint1 x71; + uint64_t x72; + fiat_p256_uint1 x73; + uint64_t x74; + fiat_p256_uint1 x75; + uint64_t x76; + uint64_t x77; + fiat_p256_uint1 x78; + uint64_t x79; + fiat_p256_uint1 x80; + uint64_t x81; + fiat_p256_uint1 x82; + uint64_t x83; + fiat_p256_uint1 x84; + uint64_t x85; + fiat_p256_uint1 x86; + uint64_t x87; + uint64_t x88; + uint64_t x89; + uint64_t x90; + x1 = (arg1[0]); + fiat_p256_mulx_u64(&x2, &x3, x1, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x4, &x5, x1, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x6, &x7, x1, 
UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x8, &x9, 0x0, x7, x4); + fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x6); + fiat_p256_addcarryx_u64(&x12, &x13, x11, 0x0, x8); + fiat_p256_addcarryx_u64(&x14, &x15, 0x0, x12, (arg1[1])); + fiat_p256_mulx_u64(&x16, &x17, x14, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x18, &x19, x14, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x20, &x21, x14, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x22, &x23, 0x0, x21, x18); + fiat_p256_addcarryx_u64(&x24, &x25, 0x0, x14, x20); + fiat_p256_addcarryx_u64(&x26, &x27, x25, (x15 + (x13 + (x9 + x5))), x22); + fiat_p256_addcarryx_u64(&x28, &x29, x27, x2, (x23 + x19)); + fiat_p256_addcarryx_u64(&x30, &x31, x29, x3, x16); + fiat_p256_addcarryx_u64(&x32, &x33, 0x0, x26, (arg1[2])); + fiat_p256_addcarryx_u64(&x34, &x35, x33, x28, 0x0); + fiat_p256_addcarryx_u64(&x36, &x37, x35, x30, 0x0); + fiat_p256_mulx_u64(&x38, &x39, x32, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x40, &x41, x32, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x42, &x43, x32, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x44, &x45, 0x0, x43, x40); + fiat_p256_addcarryx_u64(&x46, &x47, 0x0, x32, x42); + fiat_p256_addcarryx_u64(&x48, &x49, x47, x34, x44); + fiat_p256_addcarryx_u64(&x50, &x51, x49, x36, (x45 + x41)); + fiat_p256_addcarryx_u64(&x52, &x53, x51, (x37 + (x31 + x17)), x38); + fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x48, (arg1[3])); + fiat_p256_addcarryx_u64(&x56, &x57, x55, x50, 0x0); + fiat_p256_addcarryx_u64(&x58, &x59, x57, x52, 0x0); + fiat_p256_mulx_u64(&x60, &x61, x54, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x62, &x63, x54, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x66, &x67, 0x0, x65, x62); + fiat_p256_addcarryx_u64(&x68, &x69, 0x0, x54, x64); + fiat_p256_addcarryx_u64(&x70, &x71, x69, x56, x66); + fiat_p256_addcarryx_u64(&x72, &x73, x71, x58, (x67 + x63)); + fiat_p256_addcarryx_u64(&x74, &x75, x73, (x59 + (x53 + x39)), x60); + x76 = (x75 + x61); + fiat_p256_subborrowx_u64(&x77, &x78, 0x0, x70, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x79, &x80, x78, x72, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x81, &x82, x80, x74, 0x0); + fiat_p256_subborrowx_u64(&x83, &x84, x82, x76, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x85, &x86, x84, 0x0, 0x0); + fiat_p256_cmovznz_u64(&x87, x86, x77, x70); + fiat_p256_cmovznz_u64(&x88, x86, x79, x72); + fiat_p256_cmovznz_u64(&x89, x86, x81, x74); + fiat_p256_cmovznz_u64(&x90, x86, x83, x76); + out1[0] = x87; + out1[1] = x88; + out1[2] = x89; + out1[3] = x90; +} + +/* + * The function fiat_p256_to_montgomery translates a field element into the Montgomery domain. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * eval (from_montgomery out1) mod m = eval arg1 mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_to_montgomery(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_non_montgomery_domain_field_element arg1) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + fiat_p256_uint1 x14; + uint64_t x15; + fiat_p256_uint1 x16; + uint64_t x17; + fiat_p256_uint1 x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + fiat_p256_uint1 x26; + uint64_t x27; + fiat_p256_uint1 x28; + uint64_t x29; + fiat_p256_uint1 x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + fiat_p256_uint1 x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + fiat_p256_uint1 x50; + uint64_t x51; + fiat_p256_uint1 x52; + uint64_t x53; + fiat_p256_uint1 x54; + uint64_t x55; + fiat_p256_uint1 x56; + uint64_t x57; + fiat_p256_uint1 x58; + uint64_t x59; + uint64_t x60; + uint64_t x61; + uint64_t x62; + uint64_t x63; + uint64_t x64; + uint64_t x65; + fiat_p256_uint1 x66; + uint64_t x67; + fiat_p256_uint1 x68; + uint64_t x69; + fiat_p256_uint1 x70; + uint64_t x71; + fiat_p256_uint1 x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + fiat_p256_uint1 x76; + uint64_t x77; + uint64_t x78; + uint64_t x79; + uint64_t x80; + uint64_t x81; + uint64_t x82; + uint64_t x83; + uint64_t x84; + uint64_t x85; + fiat_p256_uint1 x86; + uint64_t x87; + fiat_p256_uint1 x88; + uint64_t x89; + fiat_p256_uint1 x90; + uint64_t x91; + fiat_p256_uint1 x92; + uint64_t x93; + fiat_p256_uint1 x94; + uint64_t x95; + fiat_p256_uint1 x96; + uint64_t x97; + fiat_p256_uint1 x98; + uint64_t x99; + uint64_t x100; + uint64_t x101; + uint64_t x102; + uint64_t x103; + uint64_t x104; + uint64_t x105; + fiat_p256_uint1 x106; + uint64_t x107; + fiat_p256_uint1 x108; + uint64_t x109; + fiat_p256_uint1 x110; + uint64_t x111; + fiat_p256_uint1 x112; + uint64_t x113; + fiat_p256_uint1 x114; + uint64_t x115; + fiat_p256_uint1 x116; + uint64_t x117; + uint64_t x118; + uint64_t x119; + uint64_t x120; + uint64_t x121; + uint64_t x122; + uint64_t x123; + uint64_t x124; + uint64_t x125; + fiat_p256_uint1 x126; + uint64_t x127; + fiat_p256_uint1 x128; + uint64_t x129; + fiat_p256_uint1 x130; + uint64_t x131; + fiat_p256_uint1 x132; + uint64_t x133; + fiat_p256_uint1 x134; + uint64_t x135; + fiat_p256_uint1 x136; + uint64_t x137; + fiat_p256_uint1 x138; + uint64_t x139; + uint64_t x140; + uint64_t x141; + uint64_t x142; + uint64_t x143; + uint64_t x144; + uint64_t x145; + fiat_p256_uint1 x146; + uint64_t x147; + fiat_p256_uint1 x148; + uint64_t x149; + fiat_p256_uint1 x150; + uint64_t x151; + fiat_p256_uint1 x152; + uint64_t x153; + fiat_p256_uint1 x154; + uint64_t x155; + fiat_p256_uint1 x156; + uint64_t x157; + fiat_p256_uint1 x158; + uint64_t x159; + fiat_p256_uint1 x160; + uint64_t x161; + fiat_p256_uint1 x162; + uint64_t x163; + fiat_p256_uint1 x164; + uint64_t x165; + fiat_p256_uint1 x166; + uint64_t x167; + uint64_t x168; + uint64_t x169; + uint64_t x170; + x1 = (arg1[1]); + x2 = (arg1[2]); + x3 = (arg1[3]); + x4 = (arg1[0]); + 
fiat_p256_mulx_u64(&x5, &x6, x4, UINT64_C(0x4fffffffd)); + fiat_p256_mulx_u64(&x7, &x8, x4, UINT64_C(0xfffffffffffffffe)); + fiat_p256_mulx_u64(&x9, &x10, x4, UINT64_C(0xfffffffbffffffff)); + fiat_p256_mulx_u64(&x11, &x12, x4, 0x3); + fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9); + fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7); + fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5); + fiat_p256_mulx_u64(&x19, &x20, x11, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x21, &x22, x11, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x23, &x24, x11, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x25, &x26, 0x0, x24, x21); + fiat_p256_addcarryx_u64(&x27, &x28, 0x0, x11, x23); + fiat_p256_addcarryx_u64(&x29, &x30, x28, x13, x25); + fiat_p256_addcarryx_u64(&x31, &x32, x30, x15, (x26 + x22)); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x17, x19); + fiat_p256_addcarryx_u64(&x35, &x36, x34, (x18 + x6), x20); + fiat_p256_mulx_u64(&x37, &x38, x1, UINT64_C(0x4fffffffd)); + fiat_p256_mulx_u64(&x39, &x40, x1, UINT64_C(0xfffffffffffffffe)); + fiat_p256_mulx_u64(&x41, &x42, x1, UINT64_C(0xfffffffbffffffff)); + fiat_p256_mulx_u64(&x43, &x44, x1, 0x3); + fiat_p256_addcarryx_u64(&x45, &x46, 0x0, x44, x41); + fiat_p256_addcarryx_u64(&x47, &x48, x46, x42, x39); + fiat_p256_addcarryx_u64(&x49, &x50, x48, x40, x37); + fiat_p256_addcarryx_u64(&x51, &x52, 0x0, x29, x43); + fiat_p256_addcarryx_u64(&x53, &x54, x52, x31, x45); + fiat_p256_addcarryx_u64(&x55, &x56, x54, x33, x47); + fiat_p256_addcarryx_u64(&x57, &x58, x56, x35, x49); + fiat_p256_mulx_u64(&x59, &x60, x51, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x61, &x62, x51, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x63, &x64, x51, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x65, &x66, 0x0, x64, x61); + fiat_p256_addcarryx_u64(&x67, &x68, 0x0, x51, x63); + fiat_p256_addcarryx_u64(&x69, &x70, x68, x53, x65); + fiat_p256_addcarryx_u64(&x71, &x72, x70, x55, (x66 + x62)); + fiat_p256_addcarryx_u64(&x73, &x74, x72, x57, x59); + fiat_p256_addcarryx_u64(&x75, &x76, x74, (((uint64_t)x58 + x36) + (x50 + x38)), x60); + fiat_p256_mulx_u64(&x77, &x78, x2, UINT64_C(0x4fffffffd)); + fiat_p256_mulx_u64(&x79, &x80, x2, UINT64_C(0xfffffffffffffffe)); + fiat_p256_mulx_u64(&x81, &x82, x2, UINT64_C(0xfffffffbffffffff)); + fiat_p256_mulx_u64(&x83, &x84, x2, 0x3); + fiat_p256_addcarryx_u64(&x85, &x86, 0x0, x84, x81); + fiat_p256_addcarryx_u64(&x87, &x88, x86, x82, x79); + fiat_p256_addcarryx_u64(&x89, &x90, x88, x80, x77); + fiat_p256_addcarryx_u64(&x91, &x92, 0x0, x69, x83); + fiat_p256_addcarryx_u64(&x93, &x94, x92, x71, x85); + fiat_p256_addcarryx_u64(&x95, &x96, x94, x73, x87); + fiat_p256_addcarryx_u64(&x97, &x98, x96, x75, x89); + fiat_p256_mulx_u64(&x99, &x100, x91, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x101, &x102, x91, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x103, &x104, x91, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x105, &x106, 0x0, x104, x101); + fiat_p256_addcarryx_u64(&x107, &x108, 0x0, x91, x103); + fiat_p256_addcarryx_u64(&x109, &x110, x108, x93, x105); + fiat_p256_addcarryx_u64(&x111, &x112, x110, x95, (x106 + x102)); + fiat_p256_addcarryx_u64(&x113, &x114, x112, x97, x99); + fiat_p256_addcarryx_u64(&x115, &x116, x114, (((uint64_t)x98 + x76) + (x90 + x78)), x100); + fiat_p256_mulx_u64(&x117, &x118, x3, UINT64_C(0x4fffffffd)); + fiat_p256_mulx_u64(&x119, &x120, x3, UINT64_C(0xfffffffffffffffe)); + fiat_p256_mulx_u64(&x121, &x122, x3, UINT64_C(0xfffffffbffffffff)); + fiat_p256_mulx_u64(&x123, 
&x124, x3, 0x3); + fiat_p256_addcarryx_u64(&x125, &x126, 0x0, x124, x121); + fiat_p256_addcarryx_u64(&x127, &x128, x126, x122, x119); + fiat_p256_addcarryx_u64(&x129, &x130, x128, x120, x117); + fiat_p256_addcarryx_u64(&x131, &x132, 0x0, x109, x123); + fiat_p256_addcarryx_u64(&x133, &x134, x132, x111, x125); + fiat_p256_addcarryx_u64(&x135, &x136, x134, x113, x127); + fiat_p256_addcarryx_u64(&x137, &x138, x136, x115, x129); + fiat_p256_mulx_u64(&x139, &x140, x131, UINT64_C(0xffffffff00000001)); + fiat_p256_mulx_u64(&x141, &x142, x131, UINT32_C(0xffffffff)); + fiat_p256_mulx_u64(&x143, &x144, x131, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x145, &x146, 0x0, x144, x141); + fiat_p256_addcarryx_u64(&x147, &x148, 0x0, x131, x143); + fiat_p256_addcarryx_u64(&x149, &x150, x148, x133, x145); + fiat_p256_addcarryx_u64(&x151, &x152, x150, x135, (x146 + x142)); + fiat_p256_addcarryx_u64(&x153, &x154, x152, x137, x139); + fiat_p256_addcarryx_u64(&x155, &x156, x154, (((uint64_t)x138 + x116) + (x130 + x118)), x140); + fiat_p256_subborrowx_u64(&x157, &x158, 0x0, x149, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x159, &x160, x158, x151, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x161, &x162, x160, x153, 0x0); + fiat_p256_subborrowx_u64(&x163, &x164, x162, x155, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x165, &x166, x164, x156, 0x0); + fiat_p256_cmovznz_u64(&x167, x166, x157, x149); + fiat_p256_cmovznz_u64(&x168, x166, x159, x151); + fiat_p256_cmovznz_u64(&x169, x166, x161, x153); + fiat_p256_cmovznz_u64(&x170, x166, x163, x155); + out1[0] = x167; + out1[1] = x168; + out1[2] = x169; + out1[3] = x170; +} + +/* + * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise. + * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * out1 = 0 ↔ eval (from_montgomery arg1) mod m = 0 + * + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint64_t* out1, const uint64_t arg1[4]) { + uint64_t x1; + x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | (arg1[3])))); + *out1 = x1; +} + +/* + * The function fiat_p256_selectznz is a multi-limb conditional select. + * + * Postconditions: + * out1 = (if arg1 = 0 then arg2 else arg3) + * + * Input Bounds: + * arg1: [0x0 ~> 0x1] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint64_t out1[4], fiat_p256_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + fiat_p256_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0])); + fiat_p256_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1])); + fiat_p256_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2])); + fiat_p256_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3])); + out1[0] = x1; + out1[1] = x2; + out1[2] = x3; + out1[3] = x4; +} + +/* + * The function fiat_p256_to_bytes serializes a field element NOT in the Montgomery domain to bytes in little-endian order. 
+ * + * Preconditions: + * 0 ≤ eval arg1 < m + * Postconditions: + * out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31] + * + * Input Bounds: + * arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_to_bytes(uint8_t out1[32], const uint64_t arg1[4]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint8_t x5; + uint64_t x6; + uint8_t x7; + uint64_t x8; + uint8_t x9; + uint64_t x10; + uint8_t x11; + uint64_t x12; + uint8_t x13; + uint64_t x14; + uint8_t x15; + uint64_t x16; + uint8_t x17; + uint8_t x18; + uint8_t x19; + uint64_t x20; + uint8_t x21; + uint64_t x22; + uint8_t x23; + uint64_t x24; + uint8_t x25; + uint64_t x26; + uint8_t x27; + uint64_t x28; + uint8_t x29; + uint64_t x30; + uint8_t x31; + uint8_t x32; + uint8_t x33; + uint64_t x34; + uint8_t x35; + uint64_t x36; + uint8_t x37; + uint64_t x38; + uint8_t x39; + uint64_t x40; + uint8_t x41; + uint64_t x42; + uint8_t x43; + uint64_t x44; + uint8_t x45; + uint8_t x46; + uint8_t x47; + uint64_t x48; + uint8_t x49; + uint64_t x50; + uint8_t x51; + uint64_t x52; + uint8_t x53; + uint64_t x54; + uint8_t x55; + uint64_t x56; + uint8_t x57; + uint64_t x58; + uint8_t x59; + uint8_t x60; + x1 = (arg1[3]); + x2 = (arg1[2]); + x3 = (arg1[1]); + x4 = (arg1[0]); + x5 = (uint8_t)(x4 & UINT8_C(0xff)); + x6 = (x4 >> 8); + x7 = (uint8_t)(x6 & UINT8_C(0xff)); + x8 = (x6 >> 8); + x9 = (uint8_t)(x8 & UINT8_C(0xff)); + x10 = (x8 >> 8); + x11 = (uint8_t)(x10 & UINT8_C(0xff)); + x12 = (x10 >> 8); + x13 = (uint8_t)(x12 & UINT8_C(0xff)); + x14 = (x12 >> 8); + x15 = (uint8_t)(x14 & UINT8_C(0xff)); + x16 = (x14 >> 8); + x17 = (uint8_t)(x16 & UINT8_C(0xff)); + x18 = (uint8_t)(x16 >> 8); + x19 = (uint8_t)(x3 & UINT8_C(0xff)); + x20 = (x3 >> 8); + x21 = (uint8_t)(x20 & UINT8_C(0xff)); + x22 = (x20 >> 8); + x23 = (uint8_t)(x22 & UINT8_C(0xff)); + x24 = (x22 >> 8); + x25 = (uint8_t)(x24 & UINT8_C(0xff)); + x26 = (x24 >> 8); + x27 = (uint8_t)(x26 & UINT8_C(0xff)); + x28 = (x26 >> 8); + x29 = (uint8_t)(x28 & UINT8_C(0xff)); + x30 = (x28 >> 8); + x31 = (uint8_t)(x30 & UINT8_C(0xff)); + x32 = (uint8_t)(x30 >> 8); + x33 = (uint8_t)(x2 & UINT8_C(0xff)); + x34 = (x2 >> 8); + x35 = (uint8_t)(x34 & UINT8_C(0xff)); + x36 = (x34 >> 8); + x37 = (uint8_t)(x36 & UINT8_C(0xff)); + x38 = (x36 >> 8); + x39 = (uint8_t)(x38 & UINT8_C(0xff)); + x40 = (x38 >> 8); + x41 = (uint8_t)(x40 & UINT8_C(0xff)); + x42 = (x40 >> 8); + x43 = (uint8_t)(x42 & UINT8_C(0xff)); + x44 = (x42 >> 8); + x45 = (uint8_t)(x44 & UINT8_C(0xff)); + x46 = (uint8_t)(x44 >> 8); + x47 = (uint8_t)(x1 & UINT8_C(0xff)); + x48 = (x1 >> 8); + x49 = (uint8_t)(x48 & UINT8_C(0xff)); + x50 = (x48 >> 8); + x51 = (uint8_t)(x50 & UINT8_C(0xff)); + x52 = (x50 >> 8); + x53 = (uint8_t)(x52 & UINT8_C(0xff)); + x54 = (x52 >> 8); + x55 = (uint8_t)(x54 & UINT8_C(0xff)); + x56 = (x54 >> 8); + x57 = (uint8_t)(x56 & UINT8_C(0xff)); + x58 = (x56 >> 8); + x59 = (uint8_t)(x58 
& UINT8_C(0xff)); + x60 = (uint8_t)(x58 >> 8); + out1[0] = x5; + out1[1] = x7; + out1[2] = x9; + out1[3] = x11; + out1[4] = x13; + out1[5] = x15; + out1[6] = x17; + out1[7] = x18; + out1[8] = x19; + out1[9] = x21; + out1[10] = x23; + out1[11] = x25; + out1[12] = x27; + out1[13] = x29; + out1[14] = x31; + out1[15] = x32; + out1[16] = x33; + out1[17] = x35; + out1[18] = x37; + out1[19] = x39; + out1[20] = x41; + out1[21] = x43; + out1[22] = x45; + out1[23] = x46; + out1[24] = x47; + out1[25] = x49; + out1[26] = x51; + out1[27] = x53; + out1[28] = x55; + out1[29] = x57; + out1[30] = x59; + out1[31] = x60; +} + +/* + * The function fiat_p256_from_bytes deserializes a field element NOT in the Montgomery domain from bytes in little-endian order. + * + * Preconditions: + * 0 ≤ bytes_eval arg1 < m + * Postconditions: + * eval out1 mod m = bytes_eval arg1 mod m + * 0 ≤ eval out1 < m + * + * Input Bounds: + * arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff]] + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_from_bytes(uint64_t out1[4], const uint8_t arg1[32]) { + uint64_t x1; + uint64_t x2; + uint64_t x3; + uint64_t x4; + uint64_t x5; + uint64_t x6; + uint64_t x7; + uint8_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + uint64_t x13; + uint64_t x14; + uint64_t x15; + uint8_t x16; + uint64_t x17; + uint64_t x18; + uint64_t x19; + uint64_t x20; + uint64_t x21; + uint64_t x22; + uint64_t x23; + uint8_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + uint8_t x32; + uint64_t x33; + uint64_t x34; + uint64_t x35; + uint64_t x36; + uint64_t x37; + uint64_t x38; + uint64_t x39; + uint64_t x40; + uint64_t x41; + uint64_t x42; + uint64_t x43; + uint64_t x44; + uint64_t x45; + uint64_t x46; + uint64_t x47; + uint64_t x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + uint64_t x52; + uint64_t x53; + uint64_t x54; + uint64_t x55; + uint64_t x56; + uint64_t x57; + uint64_t x58; + uint64_t x59; + uint64_t x60; + x1 = ((uint64_t)(arg1[31]) << 56); + x2 = ((uint64_t)(arg1[30]) << 48); + x3 = ((uint64_t)(arg1[29]) << 40); + x4 = ((uint64_t)(arg1[28]) << 32); + x5 = ((uint64_t)(arg1[27]) << 24); + x6 = ((uint64_t)(arg1[26]) << 16); + x7 = ((uint64_t)(arg1[25]) << 8); + x8 = (arg1[24]); + x9 = ((uint64_t)(arg1[23]) << 56); + x10 = ((uint64_t)(arg1[22]) << 48); + x11 = ((uint64_t)(arg1[21]) << 40); + x12 = ((uint64_t)(arg1[20]) << 32); + x13 = ((uint64_t)(arg1[19]) << 24); + x14 = ((uint64_t)(arg1[18]) << 16); + x15 = ((uint64_t)(arg1[17]) << 8); + x16 = (arg1[16]); + x17 = ((uint64_t)(arg1[15]) << 56); + x18 = ((uint64_t)(arg1[14]) << 48); + x19 = ((uint64_t)(arg1[13]) << 40); + x20 = ((uint64_t)(arg1[12]) << 32); + x21 = ((uint64_t)(arg1[11]) << 24); + x22 = ((uint64_t)(arg1[10]) << 16); + x23 = ((uint64_t)(arg1[9]) << 8); + x24 = (arg1[8]); + x25 = ((uint64_t)(arg1[7]) << 56); + x26 = ((uint64_t)(arg1[6]) << 48); + x27 = ((uint64_t)(arg1[5]) << 40); + 
x28 = ((uint64_t)(arg1[4]) << 32); + x29 = ((uint64_t)(arg1[3]) << 24); + x30 = ((uint64_t)(arg1[2]) << 16); + x31 = ((uint64_t)(arg1[1]) << 8); + x32 = (arg1[0]); + x33 = (x31 + (uint64_t)x32); + x34 = (x30 + x33); + x35 = (x29 + x34); + x36 = (x28 + x35); + x37 = (x27 + x36); + x38 = (x26 + x37); + x39 = (x25 + x38); + x40 = (x23 + (uint64_t)x24); + x41 = (x22 + x40); + x42 = (x21 + x41); + x43 = (x20 + x42); + x44 = (x19 + x43); + x45 = (x18 + x44); + x46 = (x17 + x45); + x47 = (x15 + (uint64_t)x16); + x48 = (x14 + x47); + x49 = (x13 + x48); + x50 = (x12 + x49); + x51 = (x11 + x50); + x52 = (x10 + x51); + x53 = (x9 + x52); + x54 = (x7 + (uint64_t)x8); + x55 = (x6 + x54); + x56 = (x5 + x55); + x57 = (x4 + x56); + x58 = (x3 + x57); + x59 = (x2 + x58); + x60 = (x1 + x59); + out1[0] = x39; + out1[1] = x46; + out1[2] = x53; + out1[3] = x60; +} + +/* + * The function fiat_p256_set_one returns the field element one in the Montgomery domain. + * + * Postconditions: + * eval (from_montgomery out1) mod m = 1 mod m + * 0 ≤ eval out1 < m + * + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_set_one(fiat_p256_montgomery_domain_field_element out1) { + out1[0] = 0x1; + out1[1] = UINT64_C(0xffffffff00000000); + out1[2] = UINT64_C(0xffffffffffffffff); + out1[3] = UINT32_C(0xfffffffe); +} + +/* + * The function fiat_p256_msat returns the saturated representation of the prime modulus. + * + * Postconditions: + * twos_complement_eval out1 = m + * 0 ≤ eval out1 < m + * + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_msat(uint64_t out1[5]) { + out1[0] = UINT64_C(0xffffffffffffffff); + out1[1] = UINT32_C(0xffffffff); + out1[2] = 0x0; + out1[3] = UINT64_C(0xffffffff00000001); + out1[4] = 0x0; +} + +/* + * The function fiat_p256_divstep computes a divstep. 
+ * + * Preconditions: + * 0 ≤ eval arg4 < m + * 0 ≤ eval arg5 < m + * Postconditions: + * out1 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then 1 - arg1 else 1 + arg1) + * twos_complement_eval out2 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then twos_complement_eval arg3 else twos_complement_eval arg2) + * twos_complement_eval out3 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then ⌊(twos_complement_eval arg3 - twos_complement_eval arg2) / 2⌋ else ⌊(twos_complement_eval arg3 + (twos_complement_eval arg3 mod 2) * twos_complement_eval arg2) / 2⌋) + * eval (from_montgomery out4) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (2 * eval (from_montgomery arg5)) mod m else (2 * eval (from_montgomery arg4)) mod m) + * eval (from_montgomery out5) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (eval (from_montgomery arg4) - eval (from_montgomery arg4)) mod m else (eval (from_montgomery arg5) + (twos_complement_eval arg3 mod 2) * eval (from_montgomery arg4)) mod m) + * 0 ≤ eval out5 < m + * 0 ≤ eval out5 < m + * 0 ≤ eval out2 < m + * 0 ≤ eval out3 < m + * + * Input Bounds: + * arg1: [0x0 ~> 0xffffffffffffffff] + * arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg4: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * arg5: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * Output Bounds: + * out1: [0x0 ~> 0xffffffffffffffff] + * out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * out3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * out4: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + * out5: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_divstep(uint64_t* out1, uint64_t out2[5], uint64_t out3[5], uint64_t out4[4], uint64_t out5[4], uint64_t arg1, const uint64_t arg2[5], const uint64_t arg3[5], const uint64_t arg4[4], const uint64_t arg5[4]) { + uint64_t x1; + fiat_p256_uint1 x2; + fiat_p256_uint1 x3; + uint64_t x4; + fiat_p256_uint1 x5; + uint64_t x6; + uint64_t x7; + uint64_t x8; + uint64_t x9; + uint64_t x10; + uint64_t x11; + uint64_t x12; + fiat_p256_uint1 x13; + uint64_t x14; + fiat_p256_uint1 x15; + uint64_t x16; + fiat_p256_uint1 x17; + uint64_t x18; + fiat_p256_uint1 x19; + uint64_t x20; + fiat_p256_uint1 x21; + uint64_t x22; + uint64_t x23; + uint64_t x24; + uint64_t x25; + uint64_t x26; + uint64_t x27; + uint64_t x28; + uint64_t x29; + uint64_t x30; + uint64_t x31; + fiat_p256_uint1 x32; + uint64_t x33; + fiat_p256_uint1 x34; + uint64_t x35; + fiat_p256_uint1 x36; + uint64_t x37; + fiat_p256_uint1 x38; + uint64_t x39; + fiat_p256_uint1 x40; + uint64_t x41; + fiat_p256_uint1 x42; + uint64_t x43; + fiat_p256_uint1 x44; + uint64_t x45; + fiat_p256_uint1 x46; + uint64_t x47; + fiat_p256_uint1 x48; + uint64_t x49; + uint64_t x50; + uint64_t x51; + 
uint64_t x52; + uint64_t x53; + fiat_p256_uint1 x54; + uint64_t x55; + fiat_p256_uint1 x56; + uint64_t x57; + fiat_p256_uint1 x58; + uint64_t x59; + fiat_p256_uint1 x60; + uint64_t x61; + uint64_t x62; + fiat_p256_uint1 x63; + uint64_t x64; + fiat_p256_uint1 x65; + uint64_t x66; + fiat_p256_uint1 x67; + uint64_t x68; + fiat_p256_uint1 x69; + uint64_t x70; + uint64_t x71; + uint64_t x72; + uint64_t x73; + fiat_p256_uint1 x74; + uint64_t x75; + uint64_t x76; + uint64_t x77; + uint64_t x78; + uint64_t x79; + uint64_t x80; + fiat_p256_uint1 x81; + uint64_t x82; + fiat_p256_uint1 x83; + uint64_t x84; + fiat_p256_uint1 x85; + uint64_t x86; + fiat_p256_uint1 x87; + uint64_t x88; + fiat_p256_uint1 x89; + uint64_t x90; + uint64_t x91; + uint64_t x92; + uint64_t x93; + uint64_t x94; + fiat_p256_uint1 x95; + uint64_t x96; + fiat_p256_uint1 x97; + uint64_t x98; + fiat_p256_uint1 x99; + uint64_t x100; + fiat_p256_uint1 x101; + uint64_t x102; + fiat_p256_uint1 x103; + uint64_t x104; + fiat_p256_uint1 x105; + uint64_t x106; + fiat_p256_uint1 x107; + uint64_t x108; + fiat_p256_uint1 x109; + uint64_t x110; + fiat_p256_uint1 x111; + uint64_t x112; + fiat_p256_uint1 x113; + uint64_t x114; + uint64_t x115; + uint64_t x116; + uint64_t x117; + uint64_t x118; + uint64_t x119; + uint64_t x120; + uint64_t x121; + uint64_t x122; + uint64_t x123; + uint64_t x124; + uint64_t x125; + uint64_t x126; + fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (~arg1), 0x1); + x3 = (fiat_p256_uint1)((fiat_p256_uint1)(x1 >> 63) & (fiat_p256_uint1)((arg3[0]) & 0x1)); + fiat_p256_addcarryx_u64(&x4, &x5, 0x0, (~arg1), 0x1); + fiat_p256_cmovznz_u64(&x6, x3, arg1, x4); + fiat_p256_cmovznz_u64(&x7, x3, (arg2[0]), (arg3[0])); + fiat_p256_cmovznz_u64(&x8, x3, (arg2[1]), (arg3[1])); + fiat_p256_cmovznz_u64(&x9, x3, (arg2[2]), (arg3[2])); + fiat_p256_cmovznz_u64(&x10, x3, (arg2[3]), (arg3[3])); + fiat_p256_cmovznz_u64(&x11, x3, (arg2[4]), (arg3[4])); + fiat_p256_addcarryx_u64(&x12, &x13, 0x0, 0x1, (~(arg2[0]))); + fiat_p256_addcarryx_u64(&x14, &x15, x13, 0x0, (~(arg2[1]))); + fiat_p256_addcarryx_u64(&x16, &x17, x15, 0x0, (~(arg2[2]))); + fiat_p256_addcarryx_u64(&x18, &x19, x17, 0x0, (~(arg2[3]))); + fiat_p256_addcarryx_u64(&x20, &x21, x19, 0x0, (~(arg2[4]))); + fiat_p256_cmovznz_u64(&x22, x3, (arg3[0]), x12); + fiat_p256_cmovznz_u64(&x23, x3, (arg3[1]), x14); + fiat_p256_cmovznz_u64(&x24, x3, (arg3[2]), x16); + fiat_p256_cmovznz_u64(&x25, x3, (arg3[3]), x18); + fiat_p256_cmovznz_u64(&x26, x3, (arg3[4]), x20); + fiat_p256_cmovznz_u64(&x27, x3, (arg4[0]), (arg5[0])); + fiat_p256_cmovznz_u64(&x28, x3, (arg4[1]), (arg5[1])); + fiat_p256_cmovznz_u64(&x29, x3, (arg4[2]), (arg5[2])); + fiat_p256_cmovznz_u64(&x30, x3, (arg4[3]), (arg5[3])); + fiat_p256_addcarryx_u64(&x31, &x32, 0x0, x27, x27); + fiat_p256_addcarryx_u64(&x33, &x34, x32, x28, x28); + fiat_p256_addcarryx_u64(&x35, &x36, x34, x29, x29); + fiat_p256_addcarryx_u64(&x37, &x38, x36, x30, x30); + fiat_p256_subborrowx_u64(&x39, &x40, 0x0, x31, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x41, &x42, x40, x33, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x43, &x44, x42, x35, 0x0); + fiat_p256_subborrowx_u64(&x45, &x46, x44, x37, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x47, &x48, x46, x38, 0x0); + x49 = (arg4[3]); + x50 = (arg4[2]); + x51 = (arg4[1]); + x52 = (arg4[0]); + fiat_p256_subborrowx_u64(&x53, &x54, 0x0, 0x0, x52); + fiat_p256_subborrowx_u64(&x55, &x56, x54, 0x0, x51); + fiat_p256_subborrowx_u64(&x57, &x58, x56, 0x0, x50); + fiat_p256_subborrowx_u64(&x59, 
&x60, x58, 0x0, x49); + fiat_p256_cmovznz_u64(&x61, x60, 0x0, UINT64_C(0xffffffffffffffff)); + fiat_p256_addcarryx_u64(&x62, &x63, 0x0, x53, x61); + fiat_p256_addcarryx_u64(&x64, &x65, x63, x55, (x61 & UINT32_C(0xffffffff))); + fiat_p256_addcarryx_u64(&x66, &x67, x65, x57, 0x0); + fiat_p256_addcarryx_u64(&x68, &x69, x67, x59, (x61 & UINT64_C(0xffffffff00000001))); + fiat_p256_cmovznz_u64(&x70, x3, (arg5[0]), x62); + fiat_p256_cmovznz_u64(&x71, x3, (arg5[1]), x64); + fiat_p256_cmovznz_u64(&x72, x3, (arg5[2]), x66); + fiat_p256_cmovznz_u64(&x73, x3, (arg5[3]), x68); + x74 = (fiat_p256_uint1)(x22 & 0x1); + fiat_p256_cmovznz_u64(&x75, x74, 0x0, x7); + fiat_p256_cmovznz_u64(&x76, x74, 0x0, x8); + fiat_p256_cmovznz_u64(&x77, x74, 0x0, x9); + fiat_p256_cmovznz_u64(&x78, x74, 0x0, x10); + fiat_p256_cmovznz_u64(&x79, x74, 0x0, x11); + fiat_p256_addcarryx_u64(&x80, &x81, 0x0, x22, x75); + fiat_p256_addcarryx_u64(&x82, &x83, x81, x23, x76); + fiat_p256_addcarryx_u64(&x84, &x85, x83, x24, x77); + fiat_p256_addcarryx_u64(&x86, &x87, x85, x25, x78); + fiat_p256_addcarryx_u64(&x88, &x89, x87, x26, x79); + fiat_p256_cmovznz_u64(&x90, x74, 0x0, x27); + fiat_p256_cmovznz_u64(&x91, x74, 0x0, x28); + fiat_p256_cmovznz_u64(&x92, x74, 0x0, x29); + fiat_p256_cmovznz_u64(&x93, x74, 0x0, x30); + fiat_p256_addcarryx_u64(&x94, &x95, 0x0, x70, x90); + fiat_p256_addcarryx_u64(&x96, &x97, x95, x71, x91); + fiat_p256_addcarryx_u64(&x98, &x99, x97, x72, x92); + fiat_p256_addcarryx_u64(&x100, &x101, x99, x73, x93); + fiat_p256_subborrowx_u64(&x102, &x103, 0x0, x94, UINT64_C(0xffffffffffffffff)); + fiat_p256_subborrowx_u64(&x104, &x105, x103, x96, UINT32_C(0xffffffff)); + fiat_p256_subborrowx_u64(&x106, &x107, x105, x98, 0x0); + fiat_p256_subborrowx_u64(&x108, &x109, x107, x100, UINT64_C(0xffffffff00000001)); + fiat_p256_subborrowx_u64(&x110, &x111, x109, x101, 0x0); + fiat_p256_addcarryx_u64(&x112, &x113, 0x0, x6, 0x1); + x114 = ((x80 >> 1) | ((x82 << 63) & UINT64_C(0xffffffffffffffff))); + x115 = ((x82 >> 1) | ((x84 << 63) & UINT64_C(0xffffffffffffffff))); + x116 = ((x84 >> 1) | ((x86 << 63) & UINT64_C(0xffffffffffffffff))); + x117 = ((x86 >> 1) | ((x88 << 63) & UINT64_C(0xffffffffffffffff))); + x118 = ((x88 & UINT64_C(0x8000000000000000)) | (x88 >> 1)); + fiat_p256_cmovznz_u64(&x119, x48, x39, x31); + fiat_p256_cmovznz_u64(&x120, x48, x41, x33); + fiat_p256_cmovznz_u64(&x121, x48, x43, x35); + fiat_p256_cmovznz_u64(&x122, x48, x45, x37); + fiat_p256_cmovznz_u64(&x123, x111, x102, x94); + fiat_p256_cmovznz_u64(&x124, x111, x104, x96); + fiat_p256_cmovznz_u64(&x125, x111, x106, x98); + fiat_p256_cmovznz_u64(&x126, x111, x108, x100); + *out1 = x112; + out2[0] = x7; + out2[1] = x8; + out2[2] = x9; + out2[3] = x10; + out2[4] = x11; + out3[0] = x114; + out3[1] = x115; + out3[2] = x116; + out3[3] = x117; + out3[4] = x118; + out4[0] = x119; + out4[1] = x120; + out4[2] = x121; + out4[3] = x122; + out5[0] = x123; + out5[1] = x124; + out5[2] = x125; + out5[3] = x126; +} + +/* + * The function fiat_p256_divstep_precomp returns the precomputed value for Bernstein-Yang-inversion (in montgomery form). 
+ * + * Postconditions: + * eval (from_montgomery out1) = ⌊(m - 1) / 2⌋^(if ⌊log2 m⌋ + 1 < 46 then ⌊(49 * (⌊log2 m⌋ + 1) + 80) / 17⌋ else ⌊(49 * (⌊log2 m⌋ + 1) + 57) / 17⌋) + * 0 ≤ eval out1 < m + * + * Output Bounds: + * out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] + */ +static FIAT_P256_FIAT_INLINE void fiat_p256_divstep_precomp(uint64_t out1[4]) { + out1[0] = UINT64_C(0x67ffffffb8000000); + out1[1] = UINT64_C(0xc000000038000000); + out1[2] = UINT64_C(0xd80000007fffffff); + out1[3] = UINT64_C(0x2fffffffffffffff); +}
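// The constants written by fiat_p256_set_one above are the Montgomery
// encoding of 1 for P-256: R mod p = 2^256 - p, where p is the prime whose
// saturated limbs fiat_p256_msat lists. A minimal standalone Rust sketch
// (illustrative only, not part of the vendored sources) checking this by
// taking the 256-bit two's complement of p:
fn main() {
    // P-256 prime, little-endian 64-bit limbs (the low four limbs of fiat_p256_msat).
    let p: [u64; 4] = [
        0xffff_ffff_ffff_ffff,
        0x0000_0000_ffff_ffff,
        0x0000_0000_0000_0000,
        0xffff_ffff_0000_0001,
    ];
    // 2^256 - p == (!p) + 1 over 256 bits, since p < 2^256.
    let mut one_r = [0u64; 4];
    let mut carry = 1u64;
    for i in 0..4 {
        let (limb, overflow) = (!p[i]).overflowing_add(carry);
        one_r[i] = limb;
        carry = u64::from(overflow);
    }
    // Exactly the limbs stored by fiat_p256_set_one.
    assert_eq!(
        one_r,
        [0x1, 0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff, 0xffff_fffe]
    );
}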

, + other_prime_len_bits: BitLength, +) -> Result, LimbSliceError> { + // `elem_exp_consttime_inner` is parameterized on `STORAGE_LIMBS` only so + // we can run tests with larger-than-supported-in-operation test vectors. + elem_exp_consttime_inner::( + out, + base, + oneRRR, + exponent, + p, + other_prime_len_bits, + ) +} + +// The maximum modulus size supported for `elem_exp_consttime` in normal +// operation. +const ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: usize = 2048 / LIMB_BITS; +const _LIMBS_PER_CHUNK_DIVIDES_ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS: () = + assert!(ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS % limbs512::LIMBS_PER_CHUNK == 0); +const WINDOW_BITS: u32 = 5; +const TABLE_ENTRIES: usize = 1 << WINDOW_BITS; +const STORAGE_ENTRIES: usize = TABLE_ENTRIES + if cfg!(target_arch = "x86_64") { 3 } else { 0 }; + +#[cfg(not(target_arch = "x86_64"))] +fn elem_exp_consttime_inner( + out: Storage, + base_mod_n: &Elem, + oneRRR: &One, + exponent: &PrivateExponent, + m: &Modulus, + other_prime_len_bits: BitLength, +) -> Result, LimbSliceError> { + use crate::{bssl, limb::Window}; + + let base_rinverse: Elem = elem_reduced(out, base_mod_n, m, other_prime_len_bits); + + let num_limbs = m.limbs().len(); + let m_chunked: AsChunks = match slice::as_chunks(m.limbs()) + { + (m, []) => m, + _ => { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + num_limbs, + ))) + } + }; + let cpe = m_chunked.len(); // 512-bit chunks per entry. + + // This code doesn't have the strict alignment requirements that the x86_64 + // version does, but uses the same aligned storage for convenience. + assert!(STORAGE_LIMBS % (STORAGE_ENTRIES * limbs512::LIMBS_PER_CHUNK) == 0); // TODO: `const` + let mut table = limbs512::AlignedStorage::::zeroed(); + let mut table = table + .aligned_chunks_mut(TABLE_ENTRIES, cpe) + .map_err(LimbSliceError::len_mismatch)?; + + // TODO: Rewrite the below in terms of `AsChunks`. + let table = table.as_flattened_mut(); + + fn gather(table: &[Limb], acc: &mut Elem, i: Window) { + prefixed_extern! { + fn LIMBS_select_512_32( + r: *mut Limb, + table: *const Limb, + num_limbs: c::size_t, + i: Window, + ) -> bssl::Result; + } + Result::from(unsafe { + LIMBS_select_512_32(acc.limbs.as_mut_ptr(), table.as_ptr(), acc.limbs.len(), i) + }) + .unwrap(); + } + + fn power( + table: &[Limb], + mut acc: Elem, + m: &Modulus, + i: Window, + mut tmp: Elem, + ) -> (Elem, Elem) { + for _ in 0..WINDOW_BITS { + acc = elem_squared(acc, m); + } + gather(table, &mut tmp, i); + let acc = elem_mul(&tmp, acc, m); + (acc, tmp) + } + + fn entry(table: &[Limb], i: usize, num_limbs: usize) -> &[Limb] { + &table[(i * num_limbs)..][..num_limbs] + } + fn entry_mut(table: &mut [Limb], i: usize, num_limbs: usize) -> &mut [Limb] { + &mut table[(i * num_limbs)..][..num_limbs] + } + + // table[0] = base**0 (i.e. 1). 
+ m.oneR(entry_mut(table, 0, num_limbs)); + + // table[1] = base*R == (base/R * RRR)/R + limbs_mul_mont( + ( + entry_mut(table, 1, num_limbs), + base_rinverse.limbs.as_ref(), + oneRRR.as_ref().limbs.as_ref(), + ), + m.limbs(), + m.n0(), + m.cpu_features(), + )?; + for i in 2..TABLE_ENTRIES { + let (src1, src2) = if i % 2 == 0 { + (i / 2, i / 2) + } else { + (i - 1, 1) + }; + let (previous, rest) = table.split_at_mut(num_limbs * i); + let src1 = entry(previous, src1, num_limbs); + let src2 = entry(previous, src2, num_limbs); + let dst = entry_mut(rest, 0, num_limbs); + limbs_mul_mont((dst, src1, src2), m.limbs(), m.n0(), m.cpu_features())?; + } + + let mut acc = Elem { + limbs: base_rinverse.limbs, + encoding: PhantomData, + }; + let tmp = m.alloc_zero(); + let tmp = Elem { + limbs: tmp.limbs, + encoding: PhantomData, + }; + let (acc, _) = limb::fold_5_bit_windows( + exponent.limbs(), + |initial_window| { + gather(&table, &mut acc, initial_window); + (acc, tmp) + }, + |(acc, tmp), window| power(&table, acc, m, window, tmp), + ); + + Ok(acc.into_unencoded(m)) +} + +#[cfg(target_arch = "x86_64")] +fn elem_exp_consttime_inner( + out: Storage, + base_mod_n: &Elem, + oneRRR: &One, + exponent: &PrivateExponent, + m: &Modulus, + other_prime_len_bits: BitLength, +) -> Result, LimbSliceError> { + use super::limbs::x86_64::mont::{ + gather5, mul_mont5, mul_mont_gather5_amm, power5_amm, scatter5, sqr_mont5, + }; + use crate::{ + cpu::{ + intel::{Adx, Bmi2}, + GetFeature as _, + }, + limb::{LeakyWindow, Window}, + polyfill::slice::AsChunksMut, + }; + + let n0 = m.n0(); + + let cpu2 = m.cpu_features().get_feature(); + let cpu3 = m.cpu_features().get_feature(); + + if base_mod_n.limbs.len() != m.limbs().len() * 2 { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + base_mod_n.limbs.len(), + ))); + } + + let m_original: AsChunks = match slice::as_chunks(m.limbs()) { + (m, []) => m, + _ => return Err(LimbSliceError::len_mismatch(LenMismatchError::new(8))), + }; + let cpe = m_original.len(); // 512-bit chunks per entry + + let oneRRR = &oneRRR.as_ref().limbs; + let oneRRR = match slice::as_chunks(oneRRR) { + (c, []) => c, + _ => { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + oneRRR.len(), + ))) + } + }; + + // The x86_64 assembly was written under the assumption that the input data + // is aligned to `MOD_EXP_CTIME_ALIGN` bytes, which was/is 64 in OpenSSL. + // Subsequently, it was changed such that, according to BoringSSL, they + // only require 16 byte alignment. We enforce the old, stronger, alignment + // unless/until we can see a benefit to reducing it. + // + // Similarly, OpenSSL uses the x86_64 assembly functions by giving it only + // inputs `tmp`, `am`, and `np` that immediately follow the table. + // According to BoringSSL, in older versions of the OpenSSL code, this + // extra space was required for memory safety because the assembly code + // would over-read the table; according to BoringSSL, this is no longer the + // case. Regardless, the upstream code also contained comments implying + // that this was also important for performance. For now, we do as OpenSSL + // did/does. 
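// Rough, illustrative storage arithmetic (not part of the vendored code):
// the table set up below holds TABLE_ENTRIES = 32 window values plus the
// three scratch entries (`tmp`, `am`, `np`) that the assembly expects to
// follow it. Assuming 64-bit limbs and 8-limb (512-bit) chunks, as in the
// "512-bit chunks per entry" comments above, the footprint for the largest
// normally supported modulus (2048 bits) works out as follows:
fn main() {
    const LIMB_BITS: usize = 64;
    const LIMBS_PER_CHUNK: usize = 512 / LIMB_BITS; // 8 limbs per 512-bit chunk
    const TABLE_ENTRIES: usize = 1 << 5; // 32 possible 5-bit windows
    const STORAGE_ENTRIES: usize = TABLE_ENTRIES + 3; // + tmp, am, np on x86_64

    let modulus_limbs = 2048 / LIMB_BITS; // 32 limbs
    let chunks_per_entry = modulus_limbs / LIMBS_PER_CHUNK; // "cpe" above: 4
    let storage_limbs = STORAGE_ENTRIES * chunks_per_entry * LIMBS_PER_CHUNK;
    assert_eq!(storage_limbs, 1120); // 8960 bytes, kept 64-byte aligned
}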
+ const MOD_EXP_CTIME_ALIGN: usize = 64; + // Required by + const _TABLE_ENTRIES_IS_32: () = assert!(TABLE_ENTRIES == 32); + const _STORAGE_ENTRIES_HAS_3_EXTRA: () = assert!(STORAGE_ENTRIES == TABLE_ENTRIES + 3); + + assert!(STORAGE_LIMBS % (STORAGE_ENTRIES * limbs512::LIMBS_PER_CHUNK) == 0); // TODO: `const` + let mut table = limbs512::AlignedStorage::::zeroed(); + let mut table = table + .aligned_chunks_mut(STORAGE_ENTRIES, cpe) + .map_err(LimbSliceError::len_mismatch)?; + let (mut table, mut state) = table.split_at_mut(TABLE_ENTRIES * cpe); + assert_eq!((table.as_ptr() as usize) % MOD_EXP_CTIME_ALIGN, 0); + + // These are named `(tmp, am, np)` in BoringSSL. + let (mut acc, mut rest) = state.split_at_mut(cpe); + let (mut base_cached, mut m_cached) = rest.split_at_mut(cpe); + + // "To improve cache locality" according to upstream. + m_cached + .as_flattened_mut() + .copy_from_slice(m_original.as_flattened()); + let m_cached = m_cached.as_ref(); + + let out: Elem = elem_reduced(out, base_mod_n, m, other_prime_len_bits); + let base_rinverse = match slice::as_chunks(&out.limbs) { + (c, []) => c, + _ => { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + out.limbs.len(), + ))) + } + }; + + // base_cached = base*R == (base/R * RRR)/R + mul_mont5( + base_cached.as_mut(), + base_rinverse, + oneRRR, + m_cached, + n0, + cpu2, + )?; + let base_cached = base_cached.as_ref(); + let mut out = Storage::from(out); // recycle. + + // Fill in all the powers of 2 of `acc` into the table using only squaring and without any + // gathering, storing the last calculated power into `acc`. + fn scatter_powers_of_2( + mut table: AsChunksMut, + mut acc: AsChunksMut, + m_cached: AsChunks, + n0: &N0, + mut i: LeakyWindow, + cpu: Option<(Adx, Bmi2)>, + ) -> Result<(), LimbSliceError> { + loop { + scatter5(acc.as_ref(), table.as_mut(), i)?; + i *= 2; + if i >= TABLE_ENTRIES as LeakyWindow { + break; + } + sqr_mont5(acc.as_mut(), m_cached, n0, cpu)?; + } + Ok(()) + } + + // All entries in `table` will be Montgomery encoded. + + // acc = table[0] = base**0 (i.e. 1). + m.oneR(acc.as_flattened_mut()); + scatter5(acc.as_ref(), table.as_mut(), 0)?; + + // acc = base**1 (i.e. base). + acc.as_flattened_mut() + .copy_from_slice(base_cached.as_flattened()); + + // Fill in entries 1, 2, 4, 8, 16. + scatter_powers_of_2(table.as_mut(), acc.as_mut(), m_cached, n0, 1, cpu2)?; + // Fill in entries 3, 6, 12, 24; 5, 10, 20, 30; 7, 14, 28; 9, 18; 11, 22; 13, 26; 15, 30; + // 17; 19; 21; 23; 25; 27; 29; 31. + for i in (3..(TABLE_ENTRIES as LeakyWindow)).step_by(2) { + let power = Window::from(i - 1); + assert!(power < 32); // Not secret, + unsafe { + mul_mont_gather5_amm( + acc.as_mut(), + base_cached, + table.as_ref(), + m_cached, + n0, + power, + cpu3, + ) + }?; + scatter_powers_of_2(table.as_mut(), acc.as_mut(), m_cached, n0, i, cpu2)?; + } + + let table = table.as_ref(); + + let acc = limb::fold_5_bit_windows( + exponent.limbs(), + |initial_window| { + unsafe { gather5(acc.as_mut(), table, initial_window) } + .unwrap_or_else(unwrap_impossible_limb_slice_error); + acc + }, + |mut acc, window| { + unsafe { power5_amm(acc.as_mut(), table, m_cached, n0, window, cpu3) } + .unwrap_or_else(unwrap_impossible_limb_slice_error); + acc + }, + ); + + // Reuse `base_rinverse`'s limbs to save an allocation. + out.limbs.copy_from_slice(acc.as_flattened()); + Ok(from_montgomery_amm(out, m)) +} + +/// Verified a == b**-1 (mod m), i.e. a**-1 == b (mod m). 
+pub fn verify_inverses_consttime( + a: &Elem, + b: Elem, + m: &Modulus, +) -> Result<(), error::Unspecified> { + let r = elem_mul(a, b, m); + limb::verify_limbs_equal_1_leak_bit(&r.limbs) +} + +#[inline] +pub fn elem_verify_equal_consttime( + a: &Elem, + b: &Elem, +) -> Result<(), error::Unspecified> { + let equal = limb::limbs_equal_limbs_consttime(&a.limbs, &b.limbs) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + if !equal.leak() { + return Err(error::Unspecified); + } + Ok(()) +} + +#[cold] +#[inline(never)] +fn unwrap_impossible_len_mismatch_error(LenMismatchError { .. }: LenMismatchError) -> T { + unreachable!() +} + +#[cold] +#[inline(never)] +fn unwrap_impossible_limb_slice_error(err: LimbSliceError) { + match err { + LimbSliceError::LenMismatch(_) => unreachable!(), + LimbSliceError::TooShort(_) => unreachable!(), + LimbSliceError::TooLong(_) => unreachable!(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cpu; + use crate::testutil as test; + + // Type-level representation of an arbitrary modulus. + struct M {} + + impl PublicModulus for M {} + + #[test] + fn test_elem_exp_consttime() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("../../crypto/fipsmodule/bn/test/mod_exp_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let m = consume_modulus::(test_case, "M"); + let m = m.modulus(cpu_features); + let expected_result = consume_elem(test_case, "ModExp", &m); + let base = consume_elem(test_case, "A", &m); + let e = { + let bytes = test_case.consume_bytes("E"); + PrivateExponent::from_be_bytes_for_test_only(untrusted::Input::from(&bytes), &m) + .expect("valid exponent") + }; + + let oneRR = One::newRR(m.alloc_zero(), &m); + let oneRRR = One::newRRR(oneRR, &m); + + // `base` in the test vectors is reduced (mod M) already but + // the API expects the bsae to be (mod N) where N = M * P for + // some other prime of the same length. Fake that here. + // Pretend there's another prime of equal length. + struct N {} + let other_modulus_len_bits = m.len_bits(); + let base: Elem = { + let mut limbs = BoxedLimbs::zero(base.limbs.len() * 2); + limbs[..base.limbs.len()].copy_from_slice(&base.limbs); + Elem { + limbs, + encoding: PhantomData, + } + }; + + let too_big = m.limbs().len() > ELEM_EXP_CONSTTIME_MAX_MODULUS_LIMBS; + let actual_result = if !too_big { + elem_exp_consttime( + m.alloc_zero(), + &base, + &oneRRR, + &e, + &m, + other_modulus_len_bits, + ) + } else { + let actual_result = elem_exp_consttime( + m.alloc_zero(), + &base, + &oneRRR, + &e, + &m, + other_modulus_len_bits, + ); + // TODO: Be more specific with which error we expect? + assert!(actual_result.is_err()); + // Try again with a larger-than-normally-supported limit + elem_exp_consttime_inner::<_, _, { (4096 / LIMB_BITS) * STORAGE_ENTRIES }>( + m.alloc_zero(), + &base, + &oneRRR, + &e, + &m, + other_modulus_len_bits, + ) + }; + match actual_result { + Ok(r) => assert_elem_eq(&r, &expected_result), + Err(LimbSliceError::LenMismatch { .. }) => panic!(), + Err(LimbSliceError::TooLong { .. }) => panic!(), + Err(LimbSliceError::TooShort { .. }) => panic!(), + }; + + Ok(()) + }, + ) + } + + // TODO: fn test_elem_exp_vartime() using + // "src/rsa/bigint_elem_exp_vartime_tests.txt". See that file for details. + // In the meantime, the function is tested indirectly via the RSA + // verification and signing tests. 
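// Stripped of Montgomery encoding and all constant-time concerns, the
// exponentiation exercised by the test above is ordinary 5-bit fixed-window
// modular exponentiation: precompute table[i] = base^i (mod m) for i in
// 0..32, then for each 5-bit window of the exponent (most significant first)
// do acc = acc^32 * table[window] (mod m). An illustrative plain-integer
// model (a sketch, not the vendored implementation):
fn windowed_mod_exp(base: u64, exponent: u64, m: u64) -> u64 {
    let mul = |a: u64, b: u64| ((u128::from(a) * u128::from(b)) % u128::from(m)) as u64;
    // table[i] = base^i mod m for every possible 5-bit window value.
    let mut table = [1 % m; 32];
    for i in 1..32 {
        table[i] = mul(table[i - 1], base);
    }
    // 13 windows of 5 bits cover all 64 exponent bits (the top window is short).
    let mut acc = 1 % m;
    for w in (0..13).rev() {
        for _ in 0..5 {
            acc = mul(acc, acc); // acc = acc^32
        }
        let window = ((exponent >> (w * 5)) & 0x1f) as usize;
        acc = mul(acc, table[window]); // fold in base^window
    }
    acc
}
// For example, windowed_mod_exp(7, 560, 561) == 1 because 561 = 3 * 11 * 17 is
// a Carmichael number; the vendored code computes the same quantity in the
// Montgomery domain, with table lookups that do not leak the window value.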
+ #[test] + fn test_elem_mul() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("../../crypto/fipsmodule/bn/test/mod_mul_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let m = consume_modulus::(test_case, "M"); + let m = m.modulus(cpu_features); + let expected_result = consume_elem(test_case, "ModMul", &m); + let a = consume_elem(test_case, "A", &m); + let b = consume_elem(test_case, "B", &m); + + let b = into_encoded(m.alloc_zero(), b, &m); + let a = into_encoded(m.alloc_zero(), a, &m); + let actual_result = elem_mul(&a, b, &m); + let actual_result = actual_result.into_unencoded(&m); + assert_elem_eq(&actual_result, &expected_result); + + Ok(()) + }, + ) + } + + #[test] + fn test_elem_squared() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("bigint_elem_squared_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let m = consume_modulus::(test_case, "M"); + let m = m.modulus(cpu_features); + let expected_result = consume_elem(test_case, "ModSquare", &m); + let a = consume_elem(test_case, "A", &m); + + let a = into_encoded(m.alloc_zero(), a, &m); + let actual_result = elem_squared(a, &m); + let actual_result = actual_result.into_unencoded(&m); + assert_elem_eq(&actual_result, &expected_result); + + Ok(()) + }, + ) + } + + #[test] + fn test_elem_reduced() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("bigint_elem_reduced_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + struct M {} + + let m_ = consume_modulus::(test_case, "M"); + let m = m_.modulus(cpu_features); + let expected_result = consume_elem(test_case, "R", &m); + let a = + consume_elem_unchecked::(test_case, "A", expected_result.limbs.len() * 2); + let other_modulus_len_bits = m_.len_bits(); + + let actual_result = elem_reduced(m.alloc_zero(), &a, &m, other_modulus_len_bits); + let oneRR = One::newRR(m.alloc_zero(), &m); + let actual_result = elem_mul(oneRR.as_ref(), actual_result, &m); + assert_elem_eq(&actual_result, &expected_result); + + Ok(()) + }, + ) + } + + #[test] + fn test_elem_reduced_once() { + let cpu_features = cpu::features(); + test::run( + test_vector_file!("bigint_elem_reduced_once_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + struct M {} + struct O {} + let m = consume_modulus::(test_case, "m"); + let m = m.modulus(cpu_features); + let a = consume_elem_unchecked::(test_case, "a", m.limbs().len()); + let expected_result = consume_elem::(test_case, "r", &m); + let other_modulus_len_bits = m.len_bits(); + + let actual_result = + elem_reduced_once(m.alloc_zero(), &a, &m, other_modulus_len_bits); + assert_elem_eq(&actual_result, &expected_result); + + Ok(()) + }, + ) + } + + fn consume_elem( + test_case: &mut test::TestCase, + name: &str, + m: &Modulus, + ) -> Elem { + let value = test_case.consume_bytes(name); + Elem::from_be_bytes_padded(untrusted::Input::from(&value), m).unwrap() + } + + fn consume_elem_unchecked( + test_case: &mut test::TestCase, + name: &str, + num_limbs: usize, + ) -> Elem { + let bytes = test_case.consume_bytes(name); + let mut limbs = BoxedLimbs::zero(num_limbs); + limb::parse_big_endian_and_pad_consttime(untrusted::Input::from(&bytes), &mut limbs) + .unwrap(); + Elem { + limbs, + encoding: PhantomData, + } + } + + fn consume_modulus(test_case: &mut test::TestCase, name: &str) -> OwnedModulus { + let value = test_case.consume_bytes(name); + OwnedModulus::from( + OwnedModulusValue::from_be_bytes(untrusted::Input::from(&value)).unwrap(), + 
) + } + + fn assert_elem_eq(a: &Elem, b: &Elem) { + if elem_verify_equal_consttime(a, b).is_err() { + panic!("{:x?} != {:x?}", &*a.limbs, &*b.limbs); + } + } + + fn into_encoded(out: Storage, a: Elem, m: &Modulus) -> Elem { + let oneRR = One::newRR(out, m); + elem_mul(oneRR.as_ref(), a, m) + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint/boxed_limbs.rs b/ring-0.17.14/src/arithmetic/bigint/boxed_limbs.rs new file mode 100644 index 0000000000..6eb465c91d --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint/boxed_limbs.rs @@ -0,0 +1,81 @@ +// Copyright 2015-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::Modulus; +use crate::{ + error, + limb::{self, Limb}, +}; +use alloc::{boxed::Box, vec}; +use core::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; + +/// All `BoxedLimbs` are stored in the same number of limbs. +pub(super) struct BoxedLimbs { + limbs: Box<[Limb]>, + + /// The modulus *m* that determines the size of `limbx`. + m: PhantomData, +} + +impl Deref for BoxedLimbs { + type Target = [Limb]; + #[inline] + fn deref(&self) -> &Self::Target { + &self.limbs + } +} + +impl DerefMut for BoxedLimbs { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.limbs + } +} + +// TODO: `derive(Clone)` after https://github.com/rust-lang/rust/issues/26925 +// is resolved or restrict `M: Clone`. +impl Clone for BoxedLimbs { + fn clone(&self) -> Self { + Self { + limbs: self.limbs.clone(), + m: self.m, + } + } +} + +impl BoxedLimbs { + pub(super) fn from_be_bytes_padded_less_than( + input: untrusted::Input, + m: &Modulus, + ) -> Result { + let mut r = Self::zero(m.limbs().len()); + limb::parse_big_endian_and_pad_consttime(input, &mut r)?; + limb::verify_limbs_less_than_limbs_leak_bit(&r, m.limbs())?; + Ok(r) + } + + pub(super) fn zero(len: usize) -> Self { + Self { + limbs: vec![0; len].into_boxed_slice(), + m: PhantomData, + } + } + + pub(super) fn into_limbs(self) -> Box<[Limb]> { + self.limbs + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint/modulus.rs b/ring-0.17.14/src/arithmetic/bigint/modulus.rs new file mode 100644 index 0000000000..b0cd6d6519 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint/modulus.rs @@ -0,0 +1,202 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + super::montgomery::Unencoded, unwrap_impossible_len_mismatch_error, BoxedLimbs, Elem, + OwnedModulusValue, PublicModulus, Storage, N0, +}; +use crate::{ + bits::BitLength, + cpu, error, + limb::{self, Limb, LIMB_BITS}, + polyfill::LeadingZerosStripped, +}; +use core::marker::PhantomData; + +/// The modulus *m* for a ring ℤ/mℤ, along with the precomputed values needed +/// for efficient Montgomery multiplication modulo *m*. The value must be odd +/// and larger than 2. The larger-than-1 requirement is imposed, at least, by +/// the modular inversion code. +pub struct OwnedModulus { + inner: OwnedModulusValue, + + // n0 * N == -1 (mod r). + // + // r == 2**(N0::LIMBS_USED * LIMB_BITS) and LG_LITTLE_R == lg(r). This + // ensures that we can do integer division by |r| by simply ignoring + // `N0::LIMBS_USED` limbs. Similarly, we can calculate values modulo `r` by + // just looking at the lowest `N0::LIMBS_USED` limbs. This is what makes + // Montgomery multiplication efficient. + // + // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography + // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a + // multi-limb Montgomery multiplication of a * b (mod n), given the + // unreduced product t == a * b, we repeatedly calculate: + // + // t1 := t % r |t1| is |t|'s lowest limb (see previous paragraph). + // t2 := t1*n0*n + // t3 := t + t2 + // t := t3 / r copy all limbs of |t3| except the lowest to |t|. + // + // In the last step, it would only make sense to ignore the lowest limb of + // |t3| if it were zero. The middle steps ensure that this is the case: + // + // t3 == 0 (mod r) + // t + t2 == 0 (mod r) + // t + t1*n0*n == 0 (mod r) + // t1*n0*n == -t (mod r) + // t*n0*n == -t (mod r) + // n0*n == -1 (mod r) + // n0 == -1/n (mod r) + // + // Thus, in each iteration of the loop, we multiply by the constant factor + // n0, the negative inverse of n (mod r). + // + // TODO(perf): Not all 32-bit platforms actually make use of n0[1]. For the + // ones that don't, we could use a shorter `R` value and use faster `Limb` + // calculations instead of double-precision `u64` calculations. + n0: N0, +} + +impl Clone for OwnedModulus { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + n0: self.n0, + } + } +} + +impl OwnedModulus { + pub(crate) fn from(n: OwnedModulusValue) -> Self { + // n_mod_r = n % r. As explained in the documentation for `n0`, this is + // done by taking the lowest `N0::LIMBS_USED` limbs of `n`. + #[allow(clippy::useless_conversion)] + let n0 = { + prefixed_extern! { + fn bn_neg_inv_mod_r_u64(n: u64) -> u64; + } + + // XXX: u64::from isn't guaranteed to be constant time. + let mut n_mod_r: u64 = u64::from(n.limbs()[0]); + + if N0::LIMBS_USED == 2 { + // XXX: If we use `<< LIMB_BITS` here then 64-bit builds + // fail to compile because of `deny(exceeding_bitshifts)`. 
+ debug_assert_eq!(LIMB_BITS, 32); + n_mod_r |= u64::from(n.limbs()[1]) << 32; + } + N0::precalculated(unsafe { bn_neg_inv_mod_r_u64(n_mod_r) }) + }; + + Self { inner: n, n0 } + } + + pub fn to_elem(&self, l: &Modulus) -> Result, error::Unspecified> { + self.inner.verify_less_than(l)?; + let mut limbs = BoxedLimbs::zero(l.limbs().len()); + limbs[..self.inner.limbs().len()].copy_from_slice(self.inner.limbs()); + Ok(Elem { + limbs, + encoding: PhantomData, + }) + } + + pub(crate) fn modulus(&self, cpu_features: cpu::Features) -> Modulus { + Modulus { + limbs: self.inner.limbs(), + n0: self.n0, + len_bits: self.len_bits(), + m: PhantomData, + cpu_features, + } + } + + pub fn len_bits(&self) -> BitLength { + self.inner.len_bits() + } +} + +impl OwnedModulus { + pub fn be_bytes(&self) -> LeadingZerosStripped + Clone + '_> { + LeadingZerosStripped::new(limb::unstripped_be_bytes(self.inner.limbs())) + } +} + +pub struct Modulus<'a, M> { + limbs: &'a [Limb], + n0: N0, + len_bits: BitLength, + m: PhantomData, + cpu_features: cpu::Features, +} + +impl Modulus<'_, M> { + pub(super) fn oneR(&self, out: &mut [Limb]) { + assert_eq!(self.limbs.len(), out.len()); + + let r = self.limbs.len() * LIMB_BITS; + + // out = 2**r - m where m = self. + limb::limbs_negative_odd(out, self.limbs); + + let lg_m = self.len_bits().as_bits(); + let leading_zero_bits_in_m = r - lg_m; + + // When m's length is a multiple of LIMB_BITS, which is the case we + // most want to optimize for, then we already have + // out == 2**r - m == 2**r (mod m). + if leading_zero_bits_in_m != 0 { + debug_assert!(leading_zero_bits_in_m < LIMB_BITS); + // Correct out to 2**(lg m) (mod m). `limbs_negative_odd` flipped + // all the leading zero bits to ones. Flip them back. + *out.last_mut().unwrap() &= (!0) >> leading_zero_bits_in_m; + + // Now we have out == 2**(lg m) (mod m). Keep doubling until we get + // to 2**r (mod m). + for _ in 0..leading_zero_bits_in_m { + limb::limbs_double_mod(out, self.limbs) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + } + } + + // Now out == 2**r (mod m) == 1*R. + } + + // TODO: XXX Avoid duplication with `Modulus`. + pub fn alloc_zero(&self) -> Storage { + Storage { + limbs: BoxedLimbs::zero(self.limbs.len()), + } + } + + #[inline] + pub(super) fn limbs(&self) -> &[Limb] { + self.limbs + } + + #[inline] + pub(super) fn n0(&self) -> &N0 { + &self.n0 + } + + pub fn len_bits(&self) -> BitLength { + self.len_bits + } + + #[inline] + pub(crate) fn cpu_features(&self) -> cpu::Features { + self.cpu_features + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint/modulusvalue.rs b/ring-0.17.14/src/arithmetic/bigint/modulusvalue.rs new file mode 100644 index 0000000000..84f5884902 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint/modulusvalue.rs @@ -0,0 +1,88 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + super::{MAX_LIMBS, MIN_LIMBS}, + BoxedLimbs, Modulus, PublicModulus, +}; +use crate::{ + bits::BitLength, + error, + limb::{self, Limb, LIMB_BYTES}, +}; + +/// `OwnedModulus`, without the overhead of Montgomery multiplication support. +pub(crate) struct OwnedModulusValue { + limbs: BoxedLimbs, // Also `value >= 3`. + + len_bits: BitLength, +} + +impl Clone for OwnedModulusValue { + fn clone(&self) -> Self { + Self { + limbs: self.limbs.clone(), + len_bits: self.len_bits, + } + } +} + +impl OwnedModulusValue { + pub(crate) fn from_be_bytes(input: untrusted::Input) -> Result { + let num_limbs = (input.len() + LIMB_BYTES - 1) / LIMB_BYTES; + const _MODULUS_MIN_LIMBS_AT_LEAST_2: () = assert!(MIN_LIMBS >= 2); + if num_limbs < MIN_LIMBS { + return Err(error::KeyRejected::unexpected_error()); + } + if num_limbs > MAX_LIMBS { + return Err(error::KeyRejected::too_large()); + } + // The above implies n >= 3, so we don't need to check that. + + // Reject leading zeros. Also reject the value zero ([0]) because zero + // isn't positive. + if untrusted::Reader::new(input).peek(0) { + return Err(error::KeyRejected::invalid_encoding()); + } + + let mut limbs = BoxedLimbs::zero(num_limbs); + limb::parse_big_endian_and_pad_consttime(input, &mut limbs) + .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; + limb::limbs_reject_even_leak_bit(&limbs) + .map_err(|_: error::Unspecified| error::KeyRejected::invalid_component())?; + + let len_bits = limb::limbs_minimal_bits(&limbs); + + Ok(Self { limbs, len_bits }) + } + + pub fn verify_less_than(&self, l: &Modulus) -> Result<(), error::Unspecified> { + if self.len_bits() > l.len_bits() { + return Err(error::Unspecified); + } + if self.limbs.len() == l.limbs().len() { + limb::verify_limbs_less_than_limbs_leak_bit(&self.limbs, l.limbs())?; + } + Ok(()) + } + + pub fn len_bits(&self) -> BitLength { + self.len_bits + } + + #[inline] + pub(super) fn limbs(&self) -> &[Limb] { + &self.limbs + } +} diff --git a/ring-0.17.14/src/arithmetic/bigint/private_exponent.rs b/ring-0.17.14/src/arithmetic/bigint/private_exponent.rs new file mode 100644 index 0000000000..82e75bf888 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/bigint/private_exponent.rs @@ -0,0 +1,77 @@ +// Copyright 2015-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{limb, BoxedLimbs, Limb, Modulus}; +use crate::error; +use alloc::boxed::Box; + +pub struct PrivateExponent { + // Unlike most `[Limb]` we deal with, these are stored most significant + // word first. + limbs: Box<[Limb]>, +} + +impl PrivateExponent { + // `p` is the modulus for which the exponent is in the interval [1, `p` - 1). + pub fn from_be_bytes_padded( + input: untrusted::Input, + p: &Modulus, + ) -> Result { + let mut dP = BoxedLimbs::from_be_bytes_padded_less_than(input, p)?; + + // Proof that `dP < p - 1`: + // + // If `dP < p` then either `dP == p - 1` or `dP < p - 1`. Since `p` is + // odd, `p - 1` is even. `d` is odd, and an odd number modulo an even + // number is odd. Therefore `dP` must be odd. But then it cannot be + // `p - 1` and so we know `dP < p - 1`. + // + // Further we know `dP != 0` because `dP` is not even. + limb::limbs_reject_even_leak_bit(&dP)?; + dP.reverse(); + + Ok(Self { + limbs: dP.into_limbs(), + }) + } + + // Create a `PrivateExponent` with a value that we do not support in + // production use, to allow testing with additional test vectors. + #[cfg(test)] + pub fn from_be_bytes_for_test_only( + input: untrusted::Input, + p: &Modulus, + ) -> Result { + use crate::limb::LIMB_BYTES; + + // Do exactly what `from_be_bytes_padded` does for any inputs it accepts. + if let r @ Ok(_) = Self::from_be_bytes_padded(input, p) { + return r; + } + + let num_limbs = (input.len() + LIMB_BYTES - 1) / LIMB_BYTES; + let mut limbs = BoxedLimbs::::zero(num_limbs); + limb::parse_big_endian_and_pad_consttime(input, &mut limbs) + .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; + limbs.reverse(); + Ok(Self { + limbs: limbs.into_limbs(), + }) + } + + #[inline] + pub(super) fn limbs(&self) -> &[Limb] { + &self.limbs + } +} diff --git a/ring-0.17.14/src/arithmetic/constant.rs b/ring-0.17.14/src/arithmetic/constant.rs new file mode 100644 index 0000000000..81d61ea9a7 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/constant.rs @@ -0,0 +1,27 @@ +use crate::limb::LeakyLimb; +use core::mem::size_of; + +const fn parse_digit(d: u8) -> u8 { + match d.to_ascii_lowercase() { + b'0'..=b'9' => d - b'0', + b'a'..=b'f' => d - b'a' + 10, + _ => panic!(), + } +} + +// TODO: this would be nicer as a trait, but currently traits don't support const functions +pub const fn limbs_from_hex(hex: &str) -> [LeakyLimb; LIMBS] { + let hex = hex.as_bytes(); + let mut limbs = [0; LIMBS]; + let limb_nibbles = size_of::() * 2; + let mut i = 0; + + while i < hex.len() { + let char = hex[hex.len() - 1 - i]; + let val = parse_digit(char); + limbs[i / limb_nibbles] |= (val as LeakyLimb) << ((i % limb_nibbles) * 4); + i += 1; + } + + limbs +} diff --git a/ring-0.17.14/src/arithmetic/ffi.rs b/ring-0.17.14/src/arithmetic/ffi.rs new file mode 100644 index 0000000000..336fb68de8 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/ffi.rs @@ -0,0 +1,97 @@ +// Copyright 2024-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{inout::AliasingSlices3, n0::N0, LimbSliceError, MAX_LIMBS, MIN_LIMBS}; +use crate::{c, limb::Limb, polyfill::usize_from_u32}; +use core::{mem::size_of, num::NonZeroUsize}; + +const _MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES: () = { + // BoringSSL's limit: 8 kiloBYTES. + const BN_MONTGOMERY_MAX_WORDS: usize = (8 * 1092) / size_of::(); + assert!(MAX_LIMBS <= BN_MONTGOMERY_MAX_WORDS); + + // Some 64-bit assembly implementations were written to take `len` as a + // `c_int`, so they zero out the undefined top half of `len` to convert it + // to a `usize`. But, others don't. + assert!(MAX_LIMBS <= usize_from_u32(u32::MAX)); +}; + +macro_rules! bn_mul_mont_ffi { + ( $in_out:expr, $n:expr, $n0:expr, $cpu:expr, + unsafe { ($MIN_LEN:expr, $MOD_LEN:expr, $Cpu:ty) => $f:ident }) => {{ + use crate::{c, limb::Limb}; + prefixed_extern! { + // `r` and/or 'a' and/or 'b' may alias. + // XXX: BoringSSL declares these functions to return `int`. + fn $f( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + n: *const Limb, + n0: &N0, + len: c::NonZero_size_t, + ); + } + unsafe { + crate::arithmetic::ffi::bn_mul_mont_ffi::<$Cpu, { $MIN_LEN }, { $MOD_LEN }>( + $in_out, $n, $n0, $cpu, $f, + ) + } + }}; +} + +#[inline] +pub(super) unsafe fn bn_mul_mont_ffi( + in_out: impl AliasingSlices3, + n: &[Limb], + n0: &N0, + cpu: Cpu, + f: unsafe extern "C" fn( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + n: *const Limb, + n0: &N0, + len: c::NonZero_size_t, + ), +) -> Result<(), LimbSliceError> { + assert_eq!(n.len() % LEN_MOD, 0); // The caller should guard against this. + + /// The x86 implementation of `bn_mul_mont`, at least, requires at least 4 + /// limbs. For a long time we have required 4 limbs for all targets, though + /// this may be unnecessary. + const _MIN_LIMBS_AT_LEAST_4: () = assert!(MIN_LIMBS >= 4); + // We haven't tested shorter lengths. + assert!(LEN_MIN >= MIN_LIMBS); + if n.len() < LEN_MIN { + return Err(LimbSliceError::too_short(n.len())); + } + let len = NonZeroUsize::new(n.len()).unwrap_or_else(|| { + // Unreachable because we checked against `LEN_MIN`, and we checked + // `LEN_MIN` is nonzero. + unreachable!() + }); + + // Avoid stack overflow from the alloca inside. + if len.get() > MAX_LIMBS { + return Err(LimbSliceError::too_long(n.len())); + } + in_out + .with_non_dangling_non_null_pointers_rab(len, |r, a, b| { + let n = n.as_ptr(); + let _: Cpu = cpu; + unsafe { f(r, a, b, n, n0, len) }; + }) + .map_err(LimbSliceError::len_mismatch) +} diff --git a/ring-0.17.14/src/arithmetic/inout.rs b/ring-0.17.14/src/arithmetic/inout.rs new file mode 100644 index 0000000000..0c9c4a1293 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/inout.rs @@ -0,0 +1,177 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub(crate) use crate::error::LenMismatchError; +use core::num::NonZeroUsize; + +pub(crate) trait AliasingSlices2 { + /// The pointers passed to `f` will be valid and non-null, and will not + /// be dangling, so they can be passed to C functions. + /// + /// The first pointer, `r`, may be pointing to uninitialized memory for + /// `expected_len` elements of type `T`, properly aligned and writable. + /// `f` must not read from `r` before writing to it. + /// + /// The second & third pointers, `a` and `b`, point to `expected_len` + /// values of type `T`, properly aligned. + /// + /// `r`, `a`, and/or `b` may alias each other only in the following ways: + /// `ptr::eq(r, a)`, `ptr::eq(r, b)`, and/or `ptr::eq(a, b)`; i.e. they + /// will not be "overlapping." + /// + /// Implementations of this trait shouldn't override this default + /// implementation. + #[inline(always)] + fn with_non_dangling_non_null_pointers_ra( + self, + expected_len: NonZeroUsize, + f: impl FnOnce(*mut T, *const T) -> R, + ) -> Result + where + Self: Sized, + { + self.with_potentially_dangling_non_null_pointers_ra(expected_len.get(), f) + } + + /// If `expected_len == 0` then the pointers passed to `f` may be + /// dangling pointers, which should not be passed to C functions. In all + /// other respects, this works like + /// `Self::with_non_dangling_non_null_pointers_rab`. + /// + /// Implementations of this trait should implement this method and not + /// `with_non_dangling_non_null_pointers_rab`. Users of this trait should + /// use `with_non_dangling_non_null_pointers_rab` and not this. + fn with_potentially_dangling_non_null_pointers_ra( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T) -> R, + ) -> Result; +} + +impl AliasingSlices2 for &mut [T] { + fn with_potentially_dangling_non_null_pointers_ra( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T) -> R, + ) -> Result { + let r = self; + if r.len() != expected_len { + return Err(LenMismatchError::new(r.len())); + } + Ok(f(r.as_mut_ptr(), r.as_ptr())) + } +} + +impl AliasingSlices2 for (&mut [T], &[T]) { + fn with_potentially_dangling_non_null_pointers_ra( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T) -> R, + ) -> Result { + let (r, a) = self; + if r.len() != expected_len { + return Err(LenMismatchError::new(r.len())); + } + if a.len() != expected_len { + return Err(LenMismatchError::new(a.len())); + } + Ok(f(r.as_mut_ptr(), a.as_ptr())) + } +} + +pub(crate) trait AliasingSlices3 { + /// The pointers passed to `f` will all be non-null and properly aligned, + /// and will not be dangling. + /// + /// The first pointer, `r` points to potentially-uninitialized writable + /// space for `expected_len` elements of type `T`. Accordingly, `f` must + /// not read from `r` before writing to it. + /// + /// The second & third pointers, `a` and `b`, point to `expected_len` + /// initialized values of type `T`. + /// + /// `r`, `a`, and/or `b` may alias each other, but only in the following + /// ways: `ptr::eq(r, a)`, `ptr::eq(r, b)`, and/or `ptr::eq(a, b)`; they + /// will not be "overlapping." + /// + /// Implementations of this trait shouldn't override this default + /// implementation. 
+ #[inline(always)] + fn with_non_dangling_non_null_pointers_rab( + self, + expected_len: NonZeroUsize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result + where + Self: Sized, + { + self.with_potentially_dangling_non_null_pointers_rab(expected_len.get(), f) + } + + /// If `expected_len == 0` then the pointers passed to `f` may be + /// dangling pointers, which should not be passed to C functions. In all + /// other respects, this works like + /// `Self::with_non_dangling_non_null_pointers_rab`. + /// + /// Implementations of this trait should implement this method and not + /// `with_non_dangling_non_null_pointers_rab`. Users of this trait should + /// use `with_non_dangling_non_null_pointers_rab` and not this. + fn with_potentially_dangling_non_null_pointers_rab( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result; +} + +impl AliasingSlices3 for &mut [T] { + fn with_potentially_dangling_non_null_pointers_rab( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result { + >::with_potentially_dangling_non_null_pointers_ra( + self, + expected_len, + |r, a| f(r, r, a), + ) + } +} + +impl AliasingSlices3 for (&mut [T], &[T], &[T]) { + fn with_potentially_dangling_non_null_pointers_rab( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result { + let (r, a, b) = self; + ((r, a), b).with_potentially_dangling_non_null_pointers_rab(expected_len, f) + } +} + +impl AliasingSlices3 for (RA, &[T]) +where + RA: AliasingSlices2, +{ + fn with_potentially_dangling_non_null_pointers_rab( + self, + expected_len: usize, + f: impl FnOnce(*mut T, *const T, *const T) -> R, + ) -> Result { + let (ra, b) = self; + if b.len() != expected_len { + return Err(LenMismatchError::new(b.len())); + } + ra.with_potentially_dangling_non_null_pointers_ra(expected_len, |r, a| f(r, a, b.as_ptr())) + } +} diff --git a/ring-0.17.14/src/arithmetic/limbs/aarch64/mod.rs b/ring-0.17.14/src/arithmetic/limbs/aarch64/mod.rs new file mode 100644 index 0000000000..fecce47f33 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/aarch64/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all(target_arch = "aarch64", target_endian = "little"))] + +pub(in super::super) mod mont; diff --git a/ring-0.17.14/src/arithmetic/limbs/aarch64/mont.rs b/ring-0.17.14/src/arithmetic/limbs/aarch64/mont.rs new file mode 100644 index 0000000000..edd289f7c0 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/aarch64/mont.rs @@ -0,0 +1,59 @@ +// Copyright 2025 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all(target_arch = "aarch64", target_endian = "little"))] + +use super::super::super::{inout::AliasingSlices3 as _, n0::N0, LimbSliceError, MAX_LIMBS}; +use crate::{ + c, + limb::Limb, + polyfill::slice::{AsChunks, AsChunksMut}, +}; +use core::num::NonZeroUsize; + +#[inline] +pub(in super::super::super) fn sqr_mont5( + mut in_out: AsChunksMut, + n: AsChunks, + n0: &N0, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // `r` and/or 'a' may alias. + // XXX: BoringSSL (kinda, implicitly) declares this to return `int`. + // `num` must be a non-zero multiple of 8. + fn bn_sqr8x_mont( + rp: *mut Limb, + ap: *const Limb, + ap_again: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t); + } + + let in_out = in_out.as_flattened_mut(); + let n = n.as_flattened(); + let num_limbs = NonZeroUsize::new(n.len()).ok_or_else(|| LimbSliceError::too_short(n.len()))?; + + // Avoid stack overflow from the alloca inside. + if num_limbs.get() > MAX_LIMBS { + return Err(LimbSliceError::too_long(num_limbs.get())); + } + + in_out + .with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, a_again| { + let n = n.as_ptr(); // Non-dangling because num_limbs > 0. + unsafe { bn_sqr8x_mont(r, a, a_again, n, n0, num_limbs) }; + }) + .map_err(LimbSliceError::len_mismatch) +} diff --git a/ring-0.17.14/src/arithmetic/limbs/mod.rs b/ring-0.17.14/src/arithmetic/limbs/mod.rs new file mode 100644 index 0000000000..0dc6af36c3 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub(super) mod aarch64; +pub(super) mod x86_64; diff --git a/ring-0.17.14/src/arithmetic/limbs/x86_64/mod.rs b/ring-0.17.14/src/arithmetic/limbs/x86_64/mod.rs new file mode 100644 index 0000000000..d7dc08cf7b --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/x86_64/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2025 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +pub(in super::super::super) mod mont; diff --git a/ring-0.17.14/src/arithmetic/limbs/x86_64/mont.rs b/ring-0.17.14/src/arithmetic/limbs/x86_64/mont.rs new file mode 100644 index 0000000000..bf92c0c56f --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs/x86_64/mont.rs @@ -0,0 +1,312 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::super::super::{ + inout::{AliasingSlices2, AliasingSlices3}, + n0::N0, + LimbSliceError, MAX_LIMBS, +}; +use crate::{ + c, + cpu::intel::{Adx, Bmi1, Bmi2}, + error::LenMismatchError, + limb::{LeakyWindow, Limb, Window}, + polyfill::slice::{AsChunks, AsChunksMut}, +}; +use core::num::NonZeroUsize; + +const _512_IS_LIMB_BITS_TIMES_8: () = assert!(8 * Limb::BITS == 512); + +#[inline] +pub(in super::super::super) fn mul_mont5( + mut r: AsChunksMut, + a: AsChunks, + b: AsChunks, + m: AsChunks, + n0: &N0, + maybe_adx_bmi2: Option<(Adx, Bmi2)>, +) -> Result<(), LimbSliceError> { + mul_mont5_4x( + (r.as_flattened_mut(), a.as_flattened(), b.as_flattened()), + m.into(), + n0, + maybe_adx_bmi2, + ) +} + +pub const MIN_4X: usize = 8; + +#[inline] +pub(in super::super::super) fn mul_mont5_4x( + in_out: impl AliasingSlices3, + n: AsChunks, + n0: &N0, + maybe_adx_bmi2: Option<(Adx, Bmi2)>, +) -> Result<(), LimbSliceError> { + const MOD_4X: usize = 4; + let n = n.as_flattened(); + if let Some(cpu) = maybe_adx_bmi2 { + bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { + (MIN_4X, MOD_4X, (Adx, Bmi2)) => bn_mulx4x_mont + }) + } else { + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_4X, MOD_4X, ()) => bn_mul4x_mont + }) + } +} + +#[inline] +pub(in super::super::super) fn sqr_mont5( + mut in_out: AsChunksMut, + n: AsChunks, + n0: &N0, + maybe_adx_bmi2: Option<(Adx, Bmi2)>, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // `r` and/or 'a' may alias. + // XXX: BoringSSL declares this to return `int`. + // `num` must be a non-zero multiple of 8. 
+ fn bn_sqr8x_mont( + rp: *mut Limb, + ap: *const Limb, + mulx_adx_capable: Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t); + } + + let in_out = in_out.as_flattened_mut(); + let n = n.as_flattened(); + let num_limbs = NonZeroUsize::new(n.len()).ok_or_else(|| LimbSliceError::too_short(n.len()))?; + + // Avoid stack overflow from the alloca inside. + if num_limbs.get() > MAX_LIMBS { + return Err(LimbSliceError::too_long(num_limbs.get())); + } + + // `Limb::from(mulx_adx.is_some())`, but intentionally branchy. + let mulx_adx_capable = match maybe_adx_bmi2 { + Some(_) => Limb::from(true), + None => Limb::from(false), + }; + + in_out + .with_non_dangling_non_null_pointers_ra(num_limbs, |r, a| { + let n = n.as_ptr(); // Non-dangling because num_limbs > 0. + unsafe { bn_sqr8x_mont(r, a, mulx_adx_capable, n, n0, num_limbs) }; + }) + .map_err(LimbSliceError::len_mismatch) +} + +#[inline(always)] +pub(in super::super::super) fn scatter5( + a: AsChunks, + mut table: AsChunksMut, + power: LeakyWindow, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // Upstream uses `num: c::size_t` too, and `power: c::size_t`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_scatter5( + inp: *const Limb, + num: c::NonZero_size_t, + table: *mut Limb, + power: LeakyWindow, + ); + } + let num_limbs = check_common(a, table.as_ref())?; + let a = a.as_flattened(); + let table = table.as_flattened_mut(); + assert!(power < 32); + unsafe { bn_scatter5(a.as_ptr(), num_limbs, table.as_mut_ptr(), power) }; + Ok(()) +} + +// SAFETY: `power` must be less than 32. +#[inline(always)] +pub(in super::super::super) unsafe fn gather5( + mut r: AsChunksMut, + table: AsChunks, + power: Window, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // Upstream uses `num: c::size_t` too, and `power: c::size_t`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_gather5( + out: *mut Limb, + num: c::NonZero_size_t, + table: *const Limb, + power: Window); + } + let num_limbs = check_common(r.as_ref(), table)?; + let r = r.as_flattened_mut(); + let table = table.as_flattened(); + // SAFETY: We cannot assert that `power` is in range because it is secret. + // TODO: Create a `Window5` type that is guaranteed to be in range. + unsafe { bn_gather5(r.as_mut_ptr(), num_limbs, table.as_ptr(), power) }; + Ok(()) +} + +// SAFETY: `power` must be less than 32. +#[inline(always)] +pub(in super::super::super) unsafe fn mul_mont_gather5_amm( + mut r: AsChunksMut, + a: AsChunks, + table: AsChunks, + n: AsChunks, + n0: &N0, + power: Window, + maybe_adx_bmi1_bmi2: Option<(Adx, Bmi1, Bmi2)>, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // Upstream has `num: c_int` and `power: c_int`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_mul4x_mont_gather5( + rp: *mut Limb, + ap: *const Limb, + table: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t, + power: Window, + ); + // Upstream has `num: c_int` and `power: c_int`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. 
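+        // The `x`-suffixed variant is the MULX/ADX code path; it is only
+        // invoked below when `maybe_adx_bmi1_bmi2` is `Some(..)`.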
+ fn bn_mulx4x_mont_gather5( + rp: *mut Limb, + ap: *const Limb, + table: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t, + power: Window, + ); + } + let num_limbs = check_common_with_n(r.as_ref(), table, n)?; + let a = a.as_flattened(); + if a.len() != num_limbs.get() { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new(a.len()))); + } + let r = r.as_flattened_mut(); + let r = r.as_mut_ptr(); + let a = a.as_ptr(); + let table = table.as_flattened(); + let table = table.as_ptr(); + let n = n.as_flattened(); + let n = n.as_ptr(); + // SAFETY: We cannot assert that `power` is in range because it is secret. + // TODO: Create a `Window5` type that is guaranteed to be in range. + if maybe_adx_bmi1_bmi2.is_some() { + unsafe { bn_mulx4x_mont_gather5(r, a, table, n, n0, num_limbs, power) } + } else { + unsafe { bn_mul4x_mont_gather5(r, a, table, n, n0, num_limbs, power) } + }; + Ok(()) +} + +// SAFETY: `power` must be less than 32. +#[inline(always)] +pub(in super::super::super) unsafe fn power5_amm( + mut in_out: AsChunksMut, + table: AsChunks, + n: AsChunks, + n0: &N0, + power: Window, + maybe_adx_bmi1_bmi2: Option<(Adx, Bmi1, Bmi2)>, +) -> Result<(), LimbSliceError> { + prefixed_extern! { + // Upstream has `num: c_int` and `power: c_int`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_power5_nohw( + rp: *mut Limb, + ap: *const Limb, + table: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t, + power: Window, + ); + // Upstream has `num: c_int` and `power: c_int`; see + // `_MAX_LIMBS_ADDRESSES_MEMORY_SAFETY_ISSUES`. + fn bn_powerx5( + rp: *mut Limb, + ap: *const Limb, + table: *const Limb, + np: *const Limb, + n0: &N0, + num: c::NonZero_size_t, + power: Window, + ); + } + let num_limbs = check_common_with_n(in_out.as_ref(), table, n)?; + let in_out = in_out.as_flattened_mut(); + let r = in_out.as_mut_ptr(); + let a = in_out.as_ptr(); + let table = table.as_flattened(); + let table = table.as_ptr(); + let n = n.as_flattened(); + let n = n.as_ptr(); + // SAFETY: We cannot assert that `power` is in range because it is secret. + // TODO: Create a `Window5` type that is guaranteed to be in range. + if maybe_adx_bmi1_bmi2.is_some() { + unsafe { bn_powerx5(r, a, table, n, n0, num_limbs, power) } + } else { + unsafe { bn_power5_nohw(r, a, table, n, n0, num_limbs, power) } + }; + Ok(()) +} + +// Helps the compiler will be able to hoist all of these checks out of the +// loops in the caller. Try to help the compiler by doing the checks +// consistently in each function and also by inlining this function and all the +// callers. +#[inline(always)] +fn check_common( + a: AsChunks, + table: AsChunks, +) -> Result { + assert_eq!((table.as_ptr() as usize) % 16, 0); // According to BoringSSL. + let a = a.as_flattened(); + let table = table.as_flattened(); + let num_limbs = NonZeroUsize::new(a.len()).ok_or_else(|| LimbSliceError::too_short(a.len()))?; + if num_limbs.get() > MAX_LIMBS { + return Err(LimbSliceError::too_long(a.len())); + } + if num_limbs.get() * 32 != table.len() { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new( + table.len(), + ))); + }; + Ok(num_limbs) +} + +#[inline(always)] +fn check_common_with_n( + a: AsChunks, + table: AsChunks, + n: AsChunks, +) -> Result { + // Choose `a` instead of `n` so that every function starts with + // `check_common` passing the exact same arguments, so that the compiler + // can easily de-dupe the checks. 
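+    // `check_common` also enforces that the table is 16-byte aligned and
+    // holds exactly `num_limbs * 32` limbs, i.e. one entry per possible
+    // 5-bit window value used by the scatter5/gather5/power5 functions.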
+ let num_limbs = check_common(a, table)?; + let n = n.as_flattened(); + if n.len() != num_limbs.get() { + return Err(LimbSliceError::len_mismatch(LenMismatchError::new(n.len()))); + } + Ok(num_limbs) +} diff --git a/ring-0.17.14/src/arithmetic/limbs512/mod.rs b/ring-0.17.14/src/arithmetic/limbs512/mod.rs new file mode 100644 index 0000000000..8b77ecf475 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs512/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +mod storage; + +pub(super) use self::storage::{AlignedStorage, LIMBS_PER_CHUNK}; diff --git a/ring-0.17.14/src/arithmetic/limbs512/storage.rs b/ring-0.17.14/src/arithmetic/limbs512/storage.rs new file mode 100644 index 0000000000..210192b4f6 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/limbs512/storage.rs @@ -0,0 +1,60 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + error::LenMismatchError, + limb::{Limb, LIMB_BITS}, + polyfill::slice::{self, AsChunksMut}, +}; +use core::mem::{align_of, size_of}; + +// Some x86_64 assembly is written under the assumption that some of its +// input data and/or temporary storage is aligned to `MOD_EXP_CTIME_ALIGN` +// bytes, which was/is 64 in OpenSSL. +// +// We use this in the non-X86-64 implementation of exponentiation as well, +// with the hope of converging th two implementations into one. + +#[repr(C, align(64))] +pub struct AlignedStorage([Limb; N]); + +const _LIMB_SIZE_DIVIDES_ALIGNMENT: () = + assert!(align_of::>() % size_of::() == 0); + +pub const LIMBS_PER_CHUNK: usize = 512 / LIMB_BITS; + +impl AlignedStorage { + pub fn zeroed() -> Self { + assert_eq!(N % LIMBS_PER_CHUNK, 0); // TODO: const. + Self([0; N]) + } + + // The result will have every chunk aligned on a 64 byte boundary. 
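+    // Why this holds: each chunk is `LIMBS_PER_CHUNK` limbs = 512 bits =
+    // 64 bytes, and the storage itself is 64-byte aligned via
+    // `repr(align(64))`, so chunk `k` begins at byte offset `64 * k` from
+    // an aligned base address.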
+ pub fn aligned_chunks_mut( + &mut self, + num_entries: usize, + chunks_per_entry: usize, + ) -> Result, LenMismatchError> { + let total_limbs = num_entries * chunks_per_entry * LIMBS_PER_CHUNK; + let len = self.0.len(); + let flattened = self + .0 + .get_mut(..total_limbs) + .ok_or_else(|| LenMismatchError::new(len))?; + match slice::as_chunks_mut(flattened) { + (chunks, []) => Ok(chunks), + (_, r) => Err(LenMismatchError::new(r.len())), + } + } +} diff --git a/ring-0.17.14/src/arithmetic/montgomery.rs b/ring-0.17.14/src/arithmetic/montgomery.rs new file mode 100644 index 0000000000..4098eced31 --- /dev/null +++ b/ring-0.17.14/src/arithmetic/montgomery.rs @@ -0,0 +1,384 @@ +// Copyright 2017-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub use super::n0::N0; +use super::{inout::AliasingSlices3, LimbSliceError, MIN_LIMBS}; +use crate::cpu; +use cfg_if::cfg_if; + +// Indicates that the element is not encoded; there is no *R* factor +// that needs to be canceled out. +#[derive(Copy, Clone)] +pub enum Unencoded {} + +// Indicates that the element is encoded; the value has one *R* +// factor that needs to be canceled out. +#[derive(Copy, Clone)] +pub enum R {} + +// Indicates the element is encoded three times; the value has three +// *R* factors that need to be canceled out. +#[allow(clippy::upper_case_acronyms)] +#[derive(Copy, Clone)] +pub enum RRR {} + +// Indicates the element is encoded twice; the value has two *R* +// factors that need to be canceled out. +#[derive(Copy, Clone)] +pub enum RR {} + +// Indicates the element is inversely encoded; the value has one +// 1/*R* factor that needs to be canceled out. +#[derive(Copy, Clone)] +pub enum RInverse {} + +pub trait Encoding {} + +impl Encoding for RRR {} +impl Encoding for RR {} +impl Encoding for R {} +impl Encoding for Unencoded {} +impl Encoding for RInverse {} + +/// The encoding of the result of a reduction. +pub trait ReductionEncoding { + type Output: Encoding; +} + +impl ReductionEncoding for RRR { + type Output = RR; +} + +impl ReductionEncoding for RR { + type Output = R; +} +impl ReductionEncoding for R { + type Output = Unencoded; +} +impl ReductionEncoding for Unencoded { + type Output = RInverse; +} + +/// The encoding of the result of a multiplication. 
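+///
+/// Montgomery multiplication computes `a * b * R^-1 (mod n)`, so, counting
+/// `R` factors (`Unencoded` = 0, `R` = 1, `RR` = 2, `RRR` = 3,
+/// `RInverse` = -1), the product's encoding has the sum of the inputs'
+/// counts minus one. For example: (R, R) => R, (RR, RR) => RRR,
+/// (Unencoded, R) => Unencoded, and (RInverse, R) => RInverse.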
+pub trait ProductEncoding { + type Output: Encoding; +} + +impl ProductEncoding for (Unencoded, E) { + type Output = E::Output; +} + +impl ProductEncoding for (R, E) { + type Output = E; +} + +impl ProductEncoding for (RR, RR) { + type Output = RRR; +} + +impl ProductEncoding for (RInverse, E) +where + E::Output: ReductionEncoding, +{ + type Output = <::Output as ReductionEncoding>::Output; +} + +// XXX: Rust doesn't allow overlapping impls, +// TODO (if/when Rust allows it): +// impl ProductEncoding for +// (E1, E2) { +// type Output = <(E2, E1) as ProductEncoding>::Output; +// } +impl ProductEncoding for (RR, Unencoded) { + type Output = <(Unencoded, RR) as ProductEncoding>::Output; +} +impl ProductEncoding for (RR, RInverse) { + type Output = <(RInverse, RR) as ProductEncoding>::Output; +} + +impl ProductEncoding for (RRR, RInverse) { + type Output = <(RInverse, RRR) as ProductEncoding>::Output; +} + +#[allow(unused_imports)] +use crate::{bssl, c, limb::Limb}; + +#[inline(always)] +pub(super) fn limbs_mul_mont( + in_out: impl AliasingSlices3, + n: &[Limb], + n0: &N0, + cpu: cpu::Features, +) -> Result<(), LimbSliceError> { + const MOD_FALLBACK: usize = 1; // No restriction. + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + let _: cpu::Features = cpu; + const MIN_4X: usize = 4; + const MOD_4X: usize = 4; + if n.len() >= MIN_4X && n.len() % MOD_4X == 0 { + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_4X, MOD_4X, ()) => bn_mul4x_mont + }) + } else { + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw + }) + } + } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { + const MIN_8X: usize = 8; + const MOD_8X: usize = 8; + if n.len() >= MIN_8X && n.len() % MOD_8X == 0 { + use crate::cpu::{GetFeature as _, arm::Neon}; + if let Some(cpu) = cpu.get_feature() { + return bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { + (MIN_8X, MOD_8X, Neon) => bn_mul8x_mont_neon + }); + } + } + // The ARM version of `bn_mul_mont_nohw` has a minimum of 2. + const _MIN_LIMBS_AT_LEAST_2: () = assert!(MIN_LIMBS >= 2); + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw + }) + } else if #[cfg(target_arch = "x86")] { + use crate::{cpu::GetFeature as _, cpu::intel::Sse2}; + // The X86 implementation of `bn_mul_mont` has a minimum of 4. + const _MIN_LIMBS_AT_LEAST_4: () = assert!(MIN_LIMBS >= 4); + if let Some(cpu) = cpu.get_feature() { + bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { + (MIN_LIMBS, MOD_FALLBACK, Sse2) => bn_mul_mont + }) + } else { + // This isn't really an FFI call; it's defined below. + unsafe { + super::ffi::bn_mul_mont_ffi::<(), {MIN_LIMBS}, 1>(in_out, n, n0, (), + bn_mul_mont_fallback) + } + } + } else if #[cfg(target_arch = "x86_64")] { + use crate::{cpu::GetFeature as _, polyfill::slice}; + use super::limbs::x86_64; + if n.len() >= x86_64::mont::MIN_4X { + if let (n, []) = slice::as_chunks(n) { + return x86_64::mont::mul_mont5_4x(in_out, n, n0, cpu.get_feature()); + } + } + bn_mul_mont_ffi!(in_out, n, n0, (), unsafe { + (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw + }) + } else { + // Use the fallback implementation implemented below through the + // FFI wrapper defined below, so that Rust and C code both go + // through `bn_mul_mont`. + bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { + (MIN_LIMBS, MOD_FALLBACK, cpu::Features) => bn_mul_mont + }) + } + } +} + +cfg_if! 
{ + if #[cfg(not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64")))] { + + // TODO: Stop calling this from C and un-export it. + #[cfg(not(target_arch = "x86"))] + prefixed_export! { + unsafe extern "C" fn bn_mul_mont( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + n: *const Limb, + n0: &N0, + num_limbs: c::NonZero_size_t, + ) { + unsafe { bn_mul_mont_fallback(r, a, b, n, n0, num_limbs) } + } + } + + #[cfg_attr(target_arch = "x86", cold)] + #[cfg_attr(target_arch = "x86", inline(never))] + unsafe extern "C" fn bn_mul_mont_fallback( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + n: *const Limb, + n0: &N0, + num_limbs: c::NonZero_size_t, + ) { + use super::MAX_LIMBS; + + let num_limbs = num_limbs.get(); + + // The mutable pointer `r` may alias `a` and/or `b`, so the lifetimes of + // any slices for `a` or `b` must not overlap with the lifetime of any + // mutable for `r`. + + // Nothing aliases `n` + let n = unsafe { core::slice::from_raw_parts(n, num_limbs) }; + + let mut tmp = [0; 2 * MAX_LIMBS]; + let tmp = &mut tmp[..(2 * num_limbs)]; + { + let a: &[Limb] = unsafe { core::slice::from_raw_parts(a, num_limbs) }; + let b: &[Limb] = unsafe { core::slice::from_raw_parts(b, num_limbs) }; + limbs_mul(tmp, a, b); + } + let r: &mut [Limb] = unsafe { core::slice::from_raw_parts_mut(r, num_limbs) }; + limbs_from_mont_in_place(r, tmp, n, n0); + } + } +} + +// `bigint` needs then when the `alloc` feature is enabled. `bn_mul_mont` above needs this when +// we are using the platforms for which we don't have `bn_mul_mont` in assembly. +#[cfg(any( + feature = "alloc", + not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" + )) +))] +pub(super) fn limbs_from_mont_in_place(r: &mut [Limb], tmp: &mut [Limb], m: &[Limb], n0: &N0) { + prefixed_extern! { + fn bn_from_montgomery_in_place( + r: *mut Limb, + num_r: c::size_t, + a: *mut Limb, + num_a: c::size_t, + n: *const Limb, + num_n: c::size_t, + n0: &N0, + ) -> bssl::Result; + } + Result::from(unsafe { + bn_from_montgomery_in_place( + r.as_mut_ptr(), + r.len(), + tmp.as_mut_ptr(), + tmp.len(), + m.as_ptr(), + m.len(), + n0, + ) + }) + .unwrap() +} + +#[cfg(not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" +)))] +fn limbs_mul(r: &mut [Limb], a: &[Limb], b: &[Limb]) { + debug_assert_eq!(r.len(), 2 * a.len()); + debug_assert_eq!(a.len(), b.len()); + let ab_len = a.len(); + + r[..ab_len].fill(0); + for (i, &b_limb) in b.iter().enumerate() { + r[ab_len + i] = unsafe { + limbs_mul_add_limb(r[i..][..ab_len].as_mut_ptr(), a.as_ptr(), b_limb, ab_len) + }; + } +} + +#[cfg(any( + test, + not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64", + )) +))] +prefixed_extern! 
{ + // `r` must not alias `a` + #[must_use] + fn limbs_mul_add_limb(r: *mut Limb, a: *const Limb, b: Limb, num_limbs: c::size_t) -> Limb; +} + +/// r = r**2 +pub(super) fn limbs_square_mont( + r: &mut [Limb], + n: &[Limb], + n0: &N0, + cpu: cpu::Features, +) -> Result<(), LimbSliceError> { + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + { + use super::limbs::aarch64; + use crate::polyfill::slice; + if let ((r, []), (n, [])) = (slice::as_chunks_mut(r), slice::as_chunks(n)) { + return aarch64::mont::sqr_mont5(r, n, n0); + } + } + + #[cfg(target_arch = "x86_64")] + { + use super::limbs::x86_64; + use crate::{cpu::GetFeature as _, polyfill::slice}; + if let ((r, []), (n, [])) = (slice::as_chunks_mut(r), slice::as_chunks(n)) { + return x86_64::mont::sqr_mont5(r, n, n0, cpu.get_feature()); + } + } + + limbs_mul_mont(r, n, n0, cpu) +} + +#[cfg(test)] +mod tests { + use super::super::MAX_LIMBS; + use super::*; + use crate::limb::Limb; + + #[test] + // TODO: wasm + fn test_mul_add_words() { + const ZERO: Limb = 0; + const MAX: Limb = ZERO.wrapping_sub(1); + static TEST_CASES: &[(&[Limb], &[Limb], Limb, Limb, &[Limb])] = &[ + (&[0], &[0], 0, 0, &[0]), + (&[MAX], &[0], MAX, 0, &[MAX]), + (&[0], &[MAX], MAX, MAX - 1, &[1]), + (&[MAX], &[MAX], MAX, MAX, &[0]), + (&[0, 0], &[MAX, MAX], MAX, MAX - 1, &[1, MAX]), + (&[1, 0], &[MAX, MAX], MAX, MAX - 1, &[2, MAX]), + (&[MAX, 0], &[MAX, MAX], MAX, MAX, &[0, 0]), + (&[0, 1], &[MAX, MAX], MAX, MAX, &[1, 0]), + (&[MAX, MAX], &[MAX, MAX], MAX, MAX, &[0, MAX]), + ]; + + for (i, (r_input, a, w, expected_retval, expected_r)) in TEST_CASES.iter().enumerate() { + let mut r = [0; MAX_LIMBS]; + let r = { + let r = &mut r[..r_input.len()]; + r.copy_from_slice(r_input); + r + }; + assert_eq!(r.len(), a.len()); // Sanity check + let actual_retval = + unsafe { limbs_mul_add_limb(r.as_mut_ptr(), a.as_ptr(), *w, a.len()) }; + assert_eq!(&r, expected_r, "{}: {:x?} != {:x?}", i, r, expected_r); + assert_eq!( + actual_retval, *expected_retval, + "{}: {:x?} != {:x?}", + i, actual_retval, *expected_retval + ); + } + } +} diff --git a/ring-0.17.14/src/arithmetic/n0.rs b/ring-0.17.14/src/arithmetic/n0.rs new file mode 100644 index 0000000000..f0c77ddecc --- /dev/null +++ b/ring-0.17.14/src/arithmetic/n0.rs @@ -0,0 +1,37 @@ +// Copyright 2015-2022 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
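+
+// `N0` carries the Montgomery reduction constant `n0 = -n^-1 (mod b)`,
+// where `b` is the radix the assembly reduces by (conventionally 2^64
+// here, which is why the constant is supplied as a `u64`). Roughly, each
+// reduction step computes `m = t0 * n0 (mod b)` and then adds `m * n` so
+// that the lowest limb(s) of the accumulator become zero and the exact
+// division by `b` is just a shift. The two-limb array lets 32-bit targets
+// store all 64 bits.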
+ +use crate::limb::Limb; + +#[derive(Clone, Copy)] +#[repr(transparent)] +pub struct N0([Limb; 2]); + +impl N0 { + #[cfg(feature = "alloc")] + pub(super) const LIMBS_USED: usize = 64 / crate::limb::LIMB_BITS; + + #[inline] + pub const fn precalculated(n0: u64) -> Self { + #[cfg(target_pointer_width = "64")] + { + Self([n0, 0]) + } + + #[cfg(target_pointer_width = "32")] + { + Self([n0 as Limb, (n0 >> crate::limb::LIMB_BITS) as Limb]) + } + } +} diff --git a/ring-0.17.14/src/bb/boolmask.rs b/ring-0.17.14/src/bb/boolmask.rs new file mode 100644 index 0000000000..d504df15aa --- /dev/null +++ b/ring-0.17.14/src/bb/boolmask.rs @@ -0,0 +1,41 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::Word; +use core::ops; + +// BoolMask is either `BoolMask::TRUE` or `BoolMask::FALSE`. +#[repr(transparent)] +pub struct BoolMask(Word); + +impl BoolMask { + #[cfg(test)] + pub(super) const TRUE: Self = Self(Word::MAX); + #[cfg(test)] + pub(super) const FALSE: Self = Self(0); + + /// Returns true if `self` is `BoolMask::TRUE`; otherwise, returns false + /// (`self` is `BoolMask::FALSE`). + pub(crate) fn leak(self) -> bool { + self.0 != 0 + } +} + +impl ops::BitAnd for BoolMask { + type Output = Self; + + fn bitand(self, rhs: Self) -> Self { + Self(self.0 & rhs.0) + } +} diff --git a/ring-0.17.14/src/bb/leaky.rs b/ring-0.17.14/src/bb/leaky.rs new file mode 100644 index 0000000000..4ff5b6c914 --- /dev/null +++ b/ring-0.17.14/src/bb/leaky.rs @@ -0,0 +1,27 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#[cfg(target_pointer_width = "64")] +type CompilerWord = u64; + +#[cfg(target_pointer_width = "32")] +type CompilerWord = u32; + +/// A native word that isn't secret. +/// +/// `LeakyWord` supports `as` conversions to/from native types. +/// +/// XXX: This isn't the native word size on targets where a pointer isn't the +/// same size as a native word. TODO: Fix this. 
+pub(crate) type LeakyWord = CompilerWord; diff --git a/ring-0.17.14/src/bb/mod.rs b/ring-0.17.14/src/bb/mod.rs new file mode 100644 index 0000000000..3920a5f0cf --- /dev/null +++ b/ring-0.17.14/src/bb/mod.rs @@ -0,0 +1,162 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Building blocks. + +use crate::{c, error}; +use core::{ffi::c_int, num::NonZeroUsize}; + +mod boolmask; +mod leaky; +mod word; + +pub(crate) use self::{boolmask::BoolMask, leaky::LeakyWord, word::Word}; + +/// Returns `Ok(())` if `a == b` and `Err(error::Unspecified)` otherwise. +pub fn verify_slices_are_equal(a: &[u8], b: &[u8]) -> Result<(), error::Unspecified> { + let len = a.len(); // Arbitrary choice. + if b.len() != len { + return Err(error::Unspecified); + } + match NonZeroUsize::new(len) { + Some(len) => { + let a = a.as_ptr(); + let b = b.as_ptr(); + // SAFETY: `a` and `b` are valid non-null non-dangling pointers to `len` + // bytes. + let result = unsafe { CRYPTO_memcmp(a, b, len) }; + match result { + 0 => Ok(()), + _ => Err(error::Unspecified), + } + } + None => Ok(()), // Empty slices are equal. + } +} + +prefixed_extern! { + fn CRYPTO_memcmp(a: *const u8, b: *const u8, len: c::NonZero_size_t) -> c_int; +} + +pub(crate) fn xor_16(a: [u8; 16], b: [u8; 16]) -> [u8; 16] { + let a = u128::from_ne_bytes(a); + let b = u128::from_ne_bytes(b); + let r = a ^ b; + r.to_ne_bytes() +} + +#[inline(always)] +pub(crate) fn xor_assign<'a>(a: impl IntoIterator, b: u8) { + a.into_iter().for_each(|a| *a ^= b); +} + +/// XORs the first N bytes of `b` into `a`, where N is +/// `core::cmp::min(a.len(), b.len())`. +#[inline(always)] +pub(crate) fn xor_assign_at_start<'a>( + a: impl IntoIterator, + b: impl IntoIterator, +) { + a.into_iter().zip(b).for_each(|(a, b)| *a ^= *b); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{bssl, rand}; + + fn leak_in_test(a: BoolMask) -> bool { + a.leak() + } + + #[test] + fn test_constant_time() -> Result<(), error::Unspecified> { + prefixed_extern! { + fn bssl_constant_time_test_main() -> bssl::Result; + } + Result::from(unsafe { bssl_constant_time_test_main() }) + } + + #[test] + fn constant_time_conditional_memcpy() -> Result<(), error::Unspecified> { + let rng = rand::SystemRandom::new(); + for _ in 0..100 { + let mut out = rand::generate::<[u8; 256]>(&rng)?.expose(); + let input = rand::generate::<[u8; 256]>(&rng)?.expose(); + + // Mask to 16 bits to make zero more likely than it would otherwise be. + let b = (rand::generate::<[u8; 1]>(&rng)?.expose()[0] & 0x0f) == 0; + + let ref_in = input; + let ref_out = if b { input } else { out }; + + prefixed_extern! 
{ + fn bssl_constant_time_test_conditional_memcpy(dst: &mut [u8; 256], src: &[u8; 256], b: BoolMask); + } + unsafe { + bssl_constant_time_test_conditional_memcpy( + &mut out, + &input, + if b { BoolMask::TRUE } else { BoolMask::FALSE }, + ) + } + assert_eq!(ref_in, input); + assert_eq!(ref_out, out); + } + + Ok(()) + } + + #[test] + fn constant_time_conditional_memxor() -> Result<(), error::Unspecified> { + let rng = rand::SystemRandom::new(); + for _ in 0..256 { + let mut out = rand::generate::<[u8; 256]>(&rng)?.expose(); + let input = rand::generate::<[u8; 256]>(&rng)?.expose(); + + // Mask to 16 bits to make zero more likely than it would otherwise be. + let b = (rand::generate::<[u8; 1]>(&rng)?.expose()[0] & 0x0f) != 0; + + let ref_in = input; + let mut ref_out = out; + if b { + xor_assign_at_start(&mut ref_out, &ref_in) + }; + + prefixed_extern! { + fn bssl_constant_time_test_conditional_memxor(dst: &mut [u8; 256], src: &[u8; 256], b: BoolMask); + } + unsafe { + bssl_constant_time_test_conditional_memxor( + &mut out, + &input, + if b { BoolMask::TRUE } else { BoolMask::FALSE }, + ); + } + + assert_eq!(ref_in, input); + assert_eq!(ref_out, out); + } + + Ok(()) + } + + #[test] + fn test_bool_mask_bitwise_and_is_logical_and() { + assert!(leak_in_test(BoolMask::TRUE & BoolMask::TRUE)); + assert!(!leak_in_test(BoolMask::TRUE & BoolMask::FALSE)); + assert!(!leak_in_test(BoolMask::FALSE & BoolMask::TRUE)); + assert!(!leak_in_test(BoolMask::FALSE & BoolMask::FALSE)); + } +} diff --git a/ring-0.17.14/src/bb/word.rs b/ring-0.17.14/src/bb/word.rs new file mode 100644 index 0000000000..a799df334d --- /dev/null +++ b/ring-0.17.14/src/bb/word.rs @@ -0,0 +1,44 @@ +// Copyright 2015-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::LeakyWord; + +/// A native word that may hold a secret. +/// +/// XXX: Currently this is a type alias of `LeakyWord` so it doesn't enforce, +/// except by convention, the prevention of leaks. This is a temporary state to +/// support the refactorings that will +/// +/// XXX: This isn't the native word size on targets where a pointer isn't the +/// same size as a native word. TODO: Fix this. +/// +/// XXX: Over time, we'll evolve Word into a newtype with an API that minimizes +/// leaks and makes all leaks explicit, like so: +pub(crate) type Word = LeakyWord; + +/* TODO: +#[repr(transparent)] +pub(crate) struct Word(LeakyWord); + +impl Word { + pub fn leak_word(self) -> LeakyWord { self.0 } +} + +impl From for Word { + fn from(w: LeakyWord) -> Self { + // TODO: Use a stronger `black_box`. + Self(core::hint::black_box(w)) + } +} +*/ diff --git a/ring-0.17.14/src/bits.rs b/ring-0.17.14/src/bits.rs new file mode 100644 index 0000000000..01b5dc53e4 --- /dev/null +++ b/ring-0.17.14/src/bits.rs @@ -0,0 +1,135 @@ +// Copyright 2016 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Bit lengths. + +use crate::{error::InputTooLongError, polyfill}; + +/// The length of something, in bits. +/// +/// This can represent a bit length that isn't a whole number of bytes. +#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd)] +#[repr(transparent)] +pub struct BitLength(T); + +pub(crate) trait FromByteLen: Sized { + /// Constructs a `BitLength` from the given length in bytes. + /// + /// Fails if `bytes * 8` is too large for a `T`. + fn from_byte_len(bytes: T) -> Result>; +} + +impl FromByteLen for BitLength { + #[inline] + fn from_byte_len(bytes: usize) -> Result { + match bytes.checked_mul(8) { + Some(bits) => Ok(Self(bits)), + None => Err(InputTooLongError::new(bytes)), + } + } +} + +impl FromByteLen for BitLength { + #[inline] + fn from_byte_len(bytes: u64) -> Result> { + match bytes.checked_mul(8) { + Some(bits) => Ok(Self(bits)), + None => Err(InputTooLongError::new(bytes)), + } + } +} + +impl FromByteLen for BitLength { + #[inline] + fn from_byte_len(bytes: usize) -> Result> { + match polyfill::u64_from_usize(bytes).checked_mul(8) { + Some(bits) => Ok(Self(bits)), + None => Err(InputTooLongError::new(bytes)), + } + } +} + +impl BitLength { + /// Constructs a `BitLength` from the given length in bits. + #[inline] + pub const fn from_bits(bits: T) -> Self { + Self(bits) + } +} + +impl BitLength { + /// The number of bits this bit length represents, as the underlying type. + #[inline] + pub fn as_bits(self) -> T { + self.0 + } +} + +// Lengths measured in bits, where all arithmetic is guaranteed not to +// overflow. +impl BitLength { + #[cfg(feature = "alloc")] + #[inline] + pub(crate) fn half_rounded_up(&self) -> Self { + let round_up = self.0 & 1; + Self((self.0 / 2) + round_up) + } + + /// The bit length, rounded up to a whole number of bytes. + #[inline] + pub const fn as_usize_bytes_rounded_up(&self) -> usize { + // Equivalent to (self.0 + 7) / 8, except with no potential for + // overflow and without branches. 
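+        // For example, 13 bits: 13 & 0b111 = 5 != 0, so we round up and get
+        // 13/8 + 1 = 2 bytes; 16 bits: 16 & 0b111 = 0, so the result is
+        // exactly 16/8 = 2 bytes.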
+ + // Branchless round_up = if self.0 & 0b111 != 0 { 1 } else { 0 }; + let round_up = ((self.0 >> 2) | (self.0 >> 1) | self.0) & 1; + + (self.0 / 8) + round_up + } + + #[cfg(feature = "alloc")] + #[inline] + pub(crate) fn try_sub_1(self) -> Result { + let sum = self.0.checked_sub(1).ok_or(crate::error::Unspecified)?; + Ok(Self(sum)) + } +} + +impl BitLength { + pub fn to_be_bytes(self) -> [u8; 8] { + self.0.to_be_bytes() + } +} + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +impl From> for BitLength { + fn from(BitLength(value): BitLength) -> Self { + BitLength(polyfill::u64_from_usize(value)) + } +} + +impl TryFrom> for BitLength { + type Error = >::Error; + + fn try_from(BitLength(value): BitLength) -> Result { + value.try_into().map(BitLength) + } +} + +const _TEST_AS_USIZE_BYTES_ROUNDED_UP_EVEN: () = + assert!(BitLength::from_bits(8192).as_usize_bytes_rounded_up() == 8192 / 8); +const _TEST_AS_USIZE_BYTES_ROUNDED_UP_ONE_BIT_HIGH: () = + assert!(BitLength::from_bits(8192 + 1).as_usize_bytes_rounded_up() == (8192 / 8) + 1); +const _TEST_AS_USIZE_BYTES_ROUNDED_UP_SEVEN_BITS_HIGH: () = + assert!(BitLength::from_bits(8192 + 7).as_usize_bytes_rounded_up() == (8192 / 8) + 1); diff --git a/ring-0.17.14/src/bssl.rs b/ring-0.17.14/src/bssl.rs new file mode 100644 index 0000000000..1b1c753784 --- /dev/null +++ b/ring-0.17.14/src/bssl.rs @@ -0,0 +1,59 @@ +// Copyright 2015 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::error; +use core::ffi::c_int; + +/// An `int` returned from a foreign function containing **1** if the function +/// was successful or **0** if an error occurred. This is the convention used by +/// C code in `ring`. +#[must_use] +#[repr(transparent)] +pub struct Result(c_int); + +impl From for core::result::Result<(), error::Unspecified> { + fn from(ret: Result) -> Self { + match ret.0 { + 1 => Ok(()), + c => { + debug_assert_eq!(c, 0, "`bssl::Result` value must be 0 or 1"); + Err(error::Unspecified) + } + } + } +} + +#[cfg(test)] +mod tests { + mod result { + use crate::bssl; + use core::{ + ffi::c_int, + mem::{align_of, size_of}, + }; + + #[test] + fn size_and_alignment() { + type Underlying = c_int; + assert_eq!(size_of::(), size_of::()); + assert_eq!(align_of::(), align_of::()); + } + + #[test] + fn semantics() { + assert!(Result::from(bssl::Result(0)).is_err()); + assert!(Result::from(bssl::Result(1)).is_ok()); + } + } +} diff --git a/ring-0.17.14/src/c.rs b/ring-0.17.14/src/c.rs new file mode 100644 index 0000000000..f5822ebccf --- /dev/null +++ b/ring-0.17.14/src/c.rs @@ -0,0 +1,33 @@ +// Copyright 2016-2019 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! C types. +//! +//! Avoid using the `libc` crate to get C types since `libc` doesn't support +//! all the targets we need to support. It turns out that the few types we need +//! are all uniformly defined on the platforms we care about. This will +//! probably change if/when we support 16-bit platforms or platforms where +//! `usize` and `uintptr_t` are different sizes. +//! +//! TODO(MSRV, feature(c_size_t)): Use `core::{ffi::c_size_t}`. +//! TODO(MSRV-1.79): Use `NonZero`. + +// Keep in sync with the checks in base.h that verify these assumptions. + +#![allow(dead_code)] + +use core::num::NonZeroUsize; + +pub(crate) type size_t = usize; +pub(crate) type NonZero_size_t = NonZeroUsize; diff --git a/ring-0.17.14/src/cpu.rs b/ring-0.17.14/src/cpu.rs new file mode 100644 index 0000000000..293e1b5355 --- /dev/null +++ b/ring-0.17.14/src/cpu.rs @@ -0,0 +1,231 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub(crate) use self::features::Features; +use core::mem::size_of; + +macro_rules! 
impl_get_feature { + { + features: [ + $( { ( $( $arch:expr ),+ ) => $Name:ident }, )+ + ], + } => { + $( + #[cfg(any( $( target_arch = $arch ),+ ))] + #[derive(Clone, Copy)] + pub(crate) struct $Name(crate::cpu::Features); + + #[cfg(any( $( target_arch = $arch ),+ ))] + impl $Name { + const fn mask() -> u32 { + 1 << (Shift::$Name as u32) + } + } + + #[cfg(any( $( target_arch = $arch ),+ ))] + impl crate::cpu::GetFeature<$Name> for super::features::Values { + #[inline(always)] + fn get_feature(&self) -> Option<$Name> { + const MASK: u32 = $Name::mask(); + const STATICALLY_DETECTED: bool = (crate::cpu::CAPS_STATIC & MASK) == MASK; + if STATICALLY_DETECTED { // TODO: `const` + return Some($Name(self.cpu())); + } + + if (self.values() & MASK) == MASK { + Some($Name(self.cpu())) + } else { + None + } + } + } + )+ + + #[repr(u32)] + enum Shift { + $( + #[cfg(any( $( target_arch = $arch ),+ ))] + $Name, + )+ + + #[cfg(target_arch = "x86_64")] + IntelCpu, + + #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", target_arch = "x86_64"))] + // Synthesized to ensure the dynamic flag set is always non-zero. + // + // Keep this at the end as it is never checked except during init. + Initialized, + } + } +} + +pub(crate) trait GetFeature { + fn get_feature(&self) -> Option; +} + +impl GetFeature<()> for features::Values { + #[inline(always)] + fn get_feature(&self) -> Option<()> { + Some(()) + } +} + +impl GetFeature<(A, B)> for features::Values +where + features::Values: GetFeature, + features::Values: GetFeature, +{ + #[inline(always)] + fn get_feature(&self) -> Option<(A, B)> { + match (self.get_feature(), self.get_feature()) { + (Some(a), Some(b)) => Some((a, b)), + _ => None, + } + } +} + +impl GetFeature<(A, B, C)> for features::Values +where + features::Values: GetFeature, + features::Values: GetFeature, + features::Values: GetFeature, +{ + #[inline(always)] + fn get_feature(&self) -> Option<(A, B, C)> { + match (self.get_feature(), self.get_feature(), self.get_feature()) { + (Some(a), Some(b), Some(c)) => Some((a, b, c)), + _ => None, + } + } +} + +impl GetFeature for Features +where + features::Values: GetFeature, +{ + #[inline(always)] + fn get_feature(&self) -> Option { + self.values().get_feature() + } +} + +#[inline(always)] +pub(crate) fn features() -> Features { + featureflags::get_or_init() +} + +mod features { + use crate::polyfill::NotSend; + + /// A witness indicating that CPU features have been detected and cached. + /// + /// This is a zero-sized type so that it can be "stored" wherever convenient. + #[derive(Copy, Clone)] + pub(crate) struct Features(NotSend); + + impl Features { + pub fn values(self) -> Values { + Values { + values: super::featureflags::get(self), + cpu: self, + } + } + } + + cfg_if::cfg_if! { + if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", target_arch = "x86_64"))] { + impl Features { + // SAFETY: This must only be called after CPU features have been written + // and synchronized. 
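+                // For example, the ARM `featureflags::get_or_init()` calls
+                // this only after `FEATURES.get_or_init(..)` has completed,
+                // which provides the required synchronization.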
+ pub(super) unsafe fn new_after_feature_flags_written_and_synced_unchecked() -> Self { + Self(NotSend::VALUE) + } + } + } else { + impl Features { + pub(super) fn new_no_features_to_detect() -> Self { + Self(NotSend::VALUE) + } + } + } + } + + pub struct Values { + values: u32, + cpu: Features, + } + + impl Values { + #[inline(always)] + pub(super) fn values(&self) -> u32 { + self.values + } + + #[inline(always)] + pub(super) fn cpu(&self) -> Features { + self.cpu + } + } +} + +const _: () = assert!(size_of::() == 0); + +cfg_if::cfg_if! { + if #[cfg(any(all(target_arch = "aarch64", target_endian = "little"), all(target_arch = "arm", target_endian = "little")))] { + pub mod arm; + use arm::featureflags; + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + pub mod intel; + use intel::featureflags; + } else { + mod featureflags { + use super::Features; + + #[inline(always)] + pub(super) fn get_or_init() -> Features { + Features::new_no_features_to_detect() + } + + #[inline(always)] + pub(super) fn get(_cpu_features: Features) -> u32 { + STATIC_DETECTED + } + + pub(super) const STATIC_DETECTED: u32 = 0; + pub(super) const FORCE_DYNAMIC_DETECTION: u32 = 0; + } + } +} + +const CAPS_STATIC: u32 = featureflags::STATIC_DETECTED & !featureflags::FORCE_DYNAMIC_DETECTION; + +#[allow(clippy::assertions_on_constants, clippy::bad_bit_mask)] +const _FORCE_DYNAMIC_DETECTION_HONORED: () = + assert!((CAPS_STATIC & featureflags::FORCE_DYNAMIC_DETECTION) == 0); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_static_is_subset_of_dynamic() { + let cpu = features(); + let dynamic = featureflags::get(cpu); + assert_eq!(dynamic & CAPS_STATIC, CAPS_STATIC); + } +} diff --git a/ring-0.17.14/src/cpu/arm.rs b/ring-0.17.14/src/cpu/arm.rs new file mode 100644 index 0000000000..8da973a1a3 --- /dev/null +++ b/ring-0.17.14/src/cpu/arm.rs @@ -0,0 +1,192 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::CAPS_STATIC; + +mod abi_assumptions { + use core::mem::size_of; + + // TODO: Support ARM64_32; see + // https://github.com/briansmith/ring/issues/1832#issuecomment-1892928147. This also requires + // replacing all `cfg(target_pointer_width)` logic for non-pointer/reference things + // (`N0`, `Limb`, `LimbMask`, `crypto_word_t` etc.). + #[cfg(target_arch = "aarch64")] + const _ASSUMED_POINTER_SIZE: usize = 8; + #[cfg(target_arch = "arm")] + const _ASSUMED_POINTER_SIZE: usize = 4; + const _ASSUMED_USIZE_SIZE: () = assert!(size_of::() == _ASSUMED_POINTER_SIZE); + const _ASSUMED_REF_SIZE: () = assert!(size_of::<&'static u8>() == _ASSUMED_POINTER_SIZE); + + // To support big-endian, we'd need to make several changes as described in + // https://github.com/briansmith/ring/issues/1832. 
+ const _ASSUMED_ENDIANNESS: () = assert!(cfg!(target_endian = "little")); +} + +// uclibc: When linked statically, uclibc doesn't provide getauxval. +// When linked dynamically, recent versions do provide it, but we +// want to support older versions too. Assume that if uclibc is being +// used, this is an embedded target where the user cares a lot about +// minimizing code size and also that they know in advance exactly +// what target features are supported, so rely only on static feature +// detection. + +cfg_if::cfg_if! { + if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), + any(target_os = "ios", target_os = "macos", target_os = "tvos", target_os = "visionos", target_os = "watchos")))] { + mod darwin; + use darwin as detect; + } else if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "fuchsia"))] { + mod fuchsia; + use fuchsia as detect; + } else if #[cfg(any(target_os = "android", target_os = "linux"))] { + mod linux; + use linux as detect; + } else if #[cfg(all(all(target_arch = "aarch64", target_endian = "little"), target_os = "windows"))] { + mod windows; + use windows as detect; + } else { + mod detect { + pub const FORCE_DYNAMIC_DETECTION: u32 = 0; + pub fn detect_features() -> u32 { 0 } + } + } +} + +impl_get_feature! { + features: [ + // TODO(MSRV): 32-bit ARM doesn't have `target_feature = "neon"` yet. + { ("aarch64", "arm") => Neon }, + + // TODO(MSRV): There is no "pmull" feature listed from + // `rustc --print cfg --target=aarch64-apple-darwin`. Originally ARMv8 tied + // PMULL detection into AES detection, but later versions split it; see + // https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile + // "Features introduced prior to 2020." Change this to use "pmull" when + // that is supported. + { ("aarch64") => PMull }, + + { ("aarch64") => Aes }, + + { ("aarch64") => Sha256 }, + + // Keep in sync with `ARMV8_SHA512`. + + // "sha3" is overloaded for both SHA-3 and SHA-512. + { ("aarch64") => Sha512 }, + ], +} + +pub(super) mod featureflags { + pub(in super::super) use super::detect::FORCE_DYNAMIC_DETECTION; + use super::*; + use crate::{ + cpu, + polyfill::{once_cell::race, usize_from_u32}, + }; + use core::num::NonZeroUsize; + #[cfg(all(target_arch = "arm", target_endian = "little"))] + use core::sync::atomic::{AtomicU32, Ordering}; + + pub(in super::super) fn get_or_init() -> cpu::Features { + fn init() -> NonZeroUsize { + let detected = detect::detect_features(); + let filtered = (if cfg!(feature = "unstable-testing-arm-no-hw") { + !Neon::mask() + } else { + 0 + }) | (if cfg!(feature = "unstable-testing-arm-no-neon") { + Neon::mask() + } else { + 0 + }); + let detected = detected & !filtered; + let merged = CAPS_STATIC | detected; + + #[cfg(all( + target_arch = "arm", + target_endian = "little", + target_has_atomic = "32" + ))] + if (merged & Neon::mask()) == Neon::mask() { + // `neon_available` is declared as `alignas(4) uint32_t` in the C code. + // AtomicU32 is `#[repr(C, align(4))]`. + prefixed_extern! { + static neon_available: AtomicU32; + } + // SAFETY: The C code only reads `neon_available`, and its + // reads are synchronized through the `OnceNonZeroUsize` + // Acquire/Release semantics as we ensure we have a + // `cpu::Features` instance before calling into the C code. 
+ let p = unsafe { &neon_available }; + p.store(1, Ordering::Relaxed); + } + + let merged = usize_from_u32(merged) | (1 << (Shift::Initialized as u32)); + NonZeroUsize::new(merged).unwrap() // Can't fail because we just set a bit. + } + + // SAFETY: This is the only caller. Any concurrent reading doesn't + // affect the safety of the writing. + let _: NonZeroUsize = FEATURES.get_or_init(init); + + // SAFETY: We initialized the CPU features as required. + unsafe { cpu::Features::new_after_feature_flags_written_and_synced_unchecked() } + } + + pub(in super::super) fn get(_cpu_features: cpu::Features) -> u32 { + // SAFETY: Since only `get_or_init()` could have created + // `_cpu_features`, and it only does so after `FEATURES.get_or_init()`, + // we know we are reading from `FEATURES` after initializing it. + // + // Also, 0 means "no features detected" to users, which is designed to + // be a safe configuration. + let features = FEATURES.get().map(NonZeroUsize::get).unwrap_or(0); + + // The truncation is lossless, as we set the value with a u32. + #[allow(clippy::cast_possible_truncation)] + let features = features as u32; + + features + } + + static FEATURES: race::OnceNonZeroUsize = race::OnceNonZeroUsize::new(); + + // TODO(MSRV): There is no "pmull" feature listed from + // `rustc --print cfg --target=aarch64-apple-darwin`. Originally ARMv8 tied + // PMULL detection into AES detection, but later versions split it; see + // https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile + // "Features introduced prior to 2020." Change this to use "pmull" when + // that is supported. + // + // "sha3" is overloaded for both SHA-3 and SHA-512. + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + #[rustfmt::skip] + pub(in super::super) const STATIC_DETECTED: u32 = 0 + | (if cfg!(target_feature = "neon") { Neon::mask() } else { 0 }) + | (if cfg!(target_feature = "aes") { Aes::mask() } else { 0 }) + | (if cfg!(target_feature = "aes") { PMull::mask() } else { 0 }) + | (if cfg!(target_feature = "sha2") { Sha256::mask() } else { 0 }) + | (if cfg!(target_feature = "sha3") { Sha512::mask() } else { 0 }) + ; + + // TODO(MSRV): 32-bit ARM doesn't support any static feature detection yet. + #[cfg(all(target_arch = "arm", target_endian = "little"))] + pub(in super::super) const STATIC_DETECTED: u32 = 0; +} + +#[allow(clippy::assertions_on_constants)] +const _AARCH64_HAS_NEON: () = assert!( + ((CAPS_STATIC & Neon::mask()) == Neon::mask()) + || !cfg!(all(target_arch = "aarch64", target_endian = "little")) +); diff --git a/ring-0.17.14/src/cpu/arm/darwin.rs b/ring-0.17.14/src/cpu/arm/darwin.rs new file mode 100644 index 0000000000..6d40bc7eca --- /dev/null +++ b/ring-0.17.14/src/cpu/arm/darwin.rs @@ -0,0 +1,113 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Aes, Neon, PMull, Sha256, Sha512, CAPS_STATIC}; +use crate::polyfill::cstr; + +// ``` +// $ rustc +1.61.0 --print cfg --target=aarch64-apple-ios | grep -E "neon|aes|sha|pmull" +// target_feature="aes" +// target_feature="neon" +// target_feature="sha2" +// $ rustc +1.61.0 --print cfg --target=aarch64-apple-darwin | grep -E "neon|aes|sha|pmull" +// target_feature="aes" +// target_feature="neon" +// target_feature="sha2" +// target_feature="sha3" +// ``` +// +// XXX/TODO(coverage)/TODO(size): aarch64-apple-darwin is statically guaranteed to have "sha3" but +// other aarch64-apple-* targets require dynamic detection. Since we don't have test coverage for +// the other targets yet, we wouldn't have a way of testing the dynamic detection if we statically +// enabled `Sha512` for -darwin. So instead, temporarily, we statically ignore the static +// availability of the feature on -darwin so that it runs the dynamic detection. +pub const MIN_STATIC_FEATURES: u32 = Neon::mask() | Aes::mask() | Sha256::mask() | PMull::mask(); +pub const FORCE_DYNAMIC_DETECTION: u32 = !MIN_STATIC_FEATURES; + +// MSRV: Enforce 1.61.0 onaarch64-apple-*, in particular) prior to. Earlier +// versions of Rust before did not report the AAarch64 CPU features correctly +// for these targets. Cargo.toml specifies `rust-version` but versions before +// Rust 1.56 don't know about it. +#[allow(clippy::assertions_on_constants)] +const _AARCH64_APPLE_TARGETS_EXPECTED_FEATURES: () = + assert!((CAPS_STATIC & MIN_STATIC_FEATURES) == MIN_STATIC_FEATURES); + +// Ensure we don't accidentally allow features statically beyond +// `MIN_STATIC_FEATURES` so that dynamic detection is done uniformly for +// all of these targets. +#[allow(clippy::assertions_on_constants)] +const _AARCH64_APPLE_DARWIN_TARGETS_EXPECTED_FEATURES: () = + assert!(CAPS_STATIC == MIN_STATIC_FEATURES); + +pub fn detect_features() -> u32 { + fn detect_feature(name: cstr::Ref) -> bool { + use crate::polyfill; + use core::mem; + use libc::{c_int, c_void}; + + let mut value: c_int = 0; + let mut len = mem::size_of_val(&value); + let value_ptr = polyfill::ptr::from_mut(&mut value).cast::(); + // SAFETY: `value_ptr` is a valid pointer to `value` and `len` is the size of `value`. + let rc = unsafe { + libc::sysctlbyname(name.as_ptr(), value_ptr, &mut len, core::ptr::null_mut(), 0) + }; + // All the conditions are separated so we can observe them in code coverage. + if rc != 0 { + return false; + } + debug_assert_eq!(len, mem::size_of_val(&value)); + if len != mem::size_of_val(&value) { + return false; + } + value != 0 + } + + // We do not need to check for the presence of NEON, as Armv8-A always has it + const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); + + let mut features = 0; + + // TODO(MSRV 1.77): Use c"..." literal. + const SHA512_NAME: cstr::Ref = + cstr::unwrap_const_from_bytes_with_nul(b"hw.optional.armv8_2_sha512\0"); + if detect_feature(SHA512_NAME) { + features |= Sha512::mask(); + } + + features +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cpu; + + #[test] + fn sha512_detection() { + // We intentionally disable static feature detection for SHA-512. 
+ const _SHA512_NOT_STATICALLY_DETECTED: () = assert!((CAPS_STATIC & Sha512::mask()) == 0); + + if cfg!(target_os = "macos") { + use crate::cpu::{arm::Sha512, GetFeature as _}; + + // All aarch64-apple-darwin targets have SHA3 enabled statically... + assert!(cfg!(target_feature = "sha3")); + + // ...so we should detect it. + let cpu = cpu::features(); + assert!(matches!(cpu.get_feature(), Some(Sha512 { .. }))); + } + } +} diff --git a/ring-0.17.14/src/cpu/arm/fuchsia.rs b/ring-0.17.14/src/cpu/arm/fuchsia.rs new file mode 100644 index 0000000000..bb1232b3a8 --- /dev/null +++ b/ring-0.17.14/src/cpu/arm/fuchsia.rs @@ -0,0 +1,58 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Aes, Neon, PMull, Sha256, Sha512, CAPS_STATIC}; + +pub const FORCE_DYNAMIC_DETECTION: u32 = 0; + +pub fn detect_features() -> u32 { + type zx_status_t = i32; + + #[link(name = "zircon")] + extern "C" { + fn zx_system_get_features(kind: u32, features: *mut u32) -> zx_status_t; + } + + const ZX_OK: i32 = 0; + const ZX_FEATURE_KIND_CPU: u32 = 0; + const ZX_ARM64_FEATURE_ISA_AES: u32 = 1 << 3; + const ZX_ARM64_FEATURE_ISA_PMULL: u32 = 1 << 4; + const ZX_ARM64_FEATURE_ISA_SHA256: u32 = 1 << 6; + const ZX_ARM64_FEATURE_ISA_SHA512: u32 = 1 << 18; + + let mut caps = 0; + let rc = unsafe { zx_system_get_features(ZX_FEATURE_KIND_CPU, &mut caps) }; + + let mut features = 0; + + // We do not need to check for the presence of NEON, as Armv8-A always has it + const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); + + if rc == ZX_OK { + if caps & ZX_ARM64_FEATURE_ISA_AES == ZX_ARM64_FEATURE_ISA_AES { + features |= Aes::mask(); + } + if caps & ZX_ARM64_FEATURE_ISA_PMULL == ZX_ARM64_FEATURE_ISA_PMULL { + features |= PMull::mask(); + } + if caps & ZX_ARM64_FEATURE_ISA_SHA256 == ZX_ARM64_FEATURE_ISA_SHA256 { + features |= Sha256::mask(); + } + if caps & ZX_ARM64_FEATURE_ISA_SHA512 == ZX_ARM64_FEATURE_ISA_SHA512 { + features |= Sha512::mask(); + } + } + + features +} diff --git a/ring-0.17.14/src/cpu/arm/linux.rs b/ring-0.17.14/src/cpu/arm/linux.rs new file mode 100644 index 0000000000..db9549c7b9 --- /dev/null +++ b/ring-0.17.14/src/cpu/arm/linux.rs @@ -0,0 +1,107 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::Neon; + +// Work around a bug in LLVM/rustc where `-C target_cpu=cortex-a72`-- +// and `-C target_cpu=native` on Cortex-A72 Raspberry PI devices in +// particular--enables crypto features even though not all Cortex-A72 +// CPUs have crypto features: +// +// ``` +// $ rustc --print cfg --target=aarch64-unknown-linux-gnu | grep feature +// target_feature="neon" +// $ rustc --print cfg --target=aarch64-unknown-linux-gnu -C target_cpu=cortex-a72 | grep feature +// target_feature="aes" +// target_feature="crc" +// target_feature="neon" +// target_feature="pmuv3" +// target_feature="sha2" +// ``` +// +// XXX/TODO(MSRV https://github.com/llvm/llvm-project/issues/90365): This +// workaround is heavy-handed since it forces extra branches for devices that +// have correctly-modeled feature sets, so it should be removed. +pub const FORCE_DYNAMIC_DETECTION: u32 = !Neon::mask(); + +// `uclibc` does not provide `getauxval` so just use static feature detection +// for it. +#[cfg(target_env = "uclibc")] +pub fn detect_features() -> u32 { + 0 +} + +#[cfg(all( + not(target_env = "uclibc"), + all(target_arch = "aarch64", target_endian = "little") +))] +pub fn detect_features() -> u32 { + use super::{Aes, PMull, Sha256, Sha512, CAPS_STATIC}; + use libc::{getauxval, AT_HWCAP, HWCAP_AES, HWCAP_PMULL, HWCAP_SHA2, HWCAP_SHA512}; + + let mut features = 0; + + // We do not need to check for the presence of NEON, as Armv8-A always has it + const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); + + let caps = unsafe { getauxval(AT_HWCAP) }; + + if caps & HWCAP_AES == HWCAP_AES { + features |= Aes::mask(); + } + if caps & HWCAP_PMULL == HWCAP_PMULL { + features |= PMull::mask(); + } + if caps & HWCAP_SHA2 == HWCAP_SHA2 { + features |= Sha256::mask(); + } + if caps & HWCAP_SHA512 == HWCAP_SHA512 { + features |= Sha512::mask(); + } + + features +} + +#[cfg(all( + not(target_env = "uclibc"), + all(target_arch = "arm", target_endian = "little") +))] +pub fn detect_features() -> u32 { + use super::CAPS_STATIC; + + // The `libc` crate doesn't provide this functionality on all + // 32-bit Linux targets, like Android or -musl. Use this polyfill + // for all 32-bit ARM targets so that testing on one of them will + // be more meaningful to the others. + use libc::c_ulong; + extern "C" { + pub fn getauxval(type_: c_ulong) -> c_ulong; + } + const AT_HWCAP: c_ulong = 16; + const HWCAP_NEON: c_ulong = 1 << 12; + + let mut features = 0; + + if CAPS_STATIC & Neon::mask() != Neon::mask() { + let caps = unsafe { getauxval(AT_HWCAP) }; + + // OpenSSL and BoringSSL don't enable any other features if NEON isn't + // available. We don't enable any hardware implementations for 32-bit ARM. + if caps & HWCAP_NEON == HWCAP_NEON { + features |= Neon::mask(); + } + } + + features +} diff --git a/ring-0.17.14/src/cpu/arm/windows.rs b/ring-0.17.14/src/cpu/arm/windows.rs new file mode 100644 index 0000000000..a753706935 --- /dev/null +++ b/ring-0.17.14/src/cpu/arm/windows.rs @@ -0,0 +1,38 @@ +// Copyright 2016-2024 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{Aes, Neon, PMull, Sha256, CAPS_STATIC}; +use windows_sys::Win32::System::Threading::{ + IsProcessorFeaturePresent, PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE, +}; + +pub const FORCE_DYNAMIC_DETECTION: u32 = 0; + +pub fn detect_features() -> u32 { + // We do not need to check for the presence of NEON, as Armv8-A always has it + const _ASSERT_NEON_DETECTED: () = assert!((CAPS_STATIC & Neon::mask()) == Neon::mask()); + + let mut features = 0; + + let result = unsafe { IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) }; + + if result != 0 { + // These are all covered by one call in Windows + features |= Aes::mask(); + features |= PMull::mask(); + features |= Sha256::mask(); + } + + features +} diff --git a/ring-0.17.14/src/cpu/intel.rs b/ring-0.17.14/src/cpu/intel.rs new file mode 100644 index 0000000000..f45052fe7f --- /dev/null +++ b/ring-0.17.14/src/cpu/intel.rs @@ -0,0 +1,382 @@ +// Copyright 2016-2021 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use cfg_if::cfg_if; + +mod abi_assumptions { + use core::mem::size_of; + + // TOOD: Support targets that do not have SSE and SSE2 enabled, such as + // x86_64-unknown-linux-none. See + // https://github.com/briansmith/ring/issues/1793#issuecomment-1793243725, + // https://github.com/briansmith/ring/issues/1832, + // https://github.com/briansmith/ring/issues/1833. 
+ const _ASSUMES_SSE2: () = + assert!(cfg!(target_feature = "sse") && cfg!(target_feature = "sse2")); + + #[cfg(target_arch = "x86_64")] + const _ASSUMED_POINTER_SIZE: usize = 8; + #[cfg(target_arch = "x86")] + const _ASSUMED_POINTER_SIZE: usize = 4; + const _ASSUMED_USIZE_SIZE: () = assert!(size_of::() == _ASSUMED_POINTER_SIZE); + const _ASSUMED_REF_SIZE: () = assert!(size_of::<&'static u8>() == _ASSUMED_POINTER_SIZE); + + const _ASSUMED_ENDIANNESS: () = assert!(cfg!(target_endian = "little")); +} + +pub(super) mod featureflags { + use super::super::CAPS_STATIC; + use crate::{ + cpu, + polyfill::{once_cell::race, usize_from_u32}, + }; + use core::num::NonZeroUsize; + + pub(in super::super) fn get_or_init() -> cpu::Features { + // SAFETY: `OPENSSL_cpuid_setup` must be called only in + // `INIT.call_once()` below. + prefixed_extern! { + fn OPENSSL_cpuid_setup(out: &mut [u32; 4]); + } + + let _: NonZeroUsize = FEATURES.get_or_init(|| { + let mut cpuid = [0; 4]; + // SAFETY: We assume that it is safe to execute CPUID and XGETBV. + unsafe { + OPENSSL_cpuid_setup(&mut cpuid); + } + let detected = super::cpuid_to_caps_and_set_c_flags(&cpuid); + let merged = CAPS_STATIC | detected; + + let merged = usize_from_u32(merged) | (1 << (super::Shift::Initialized as u32)); + NonZeroUsize::new(merged).unwrap() // Can't fail because we just set a bit. + }); + + // SAFETY: We initialized the CPU features as required. + // `INIT.call_once` has `happens-before` semantics. + unsafe { cpu::Features::new_after_feature_flags_written_and_synced_unchecked() } + } + + pub(in super::super) fn get(_cpu_features: cpu::Features) -> u32 { + // SAFETY: Since only `get_or_init()` could have created + // `_cpu_features`, and it only does so after `FEATURES.get_or_init()`, + // we know we are reading from `FEATURES` after initializing it. + // + // Also, 0 means "no features detected" to users, which is designed to + // be a safe configuration. + let features = FEATURES.get().map(NonZeroUsize::get).unwrap_or(0); + + // The truncation is lossless, as we set the value with a u32. + #[allow(clippy::cast_possible_truncation)] + let features = features as u32; + + features + } + + static FEATURES: race::OnceNonZeroUsize = race::OnceNonZeroUsize::new(); + + #[cfg(target_arch = "x86")] + #[rustfmt::skip] + pub const STATIC_DETECTED: u32 = 0 + | (if cfg!(target_feature = "sse2") { super::Sse2::mask() } else { 0 }) + ; + + // Limited to x86_64-v2 features. + // TODO: Add missing x86-64-v3 features if we find real-world use of x86-64-v3. + // TODO: Add all features we use. + #[cfg(target_arch = "x86_64")] + #[rustfmt::skip] + pub const STATIC_DETECTED: u32 = 0 + | if cfg!(target_feature = "sse4.1") { super::Sse41::mask() } else { 0 } + | if cfg!(target_feature = "ssse3") { super::Ssse3::mask() } else { 0 } + ; + + pub const FORCE_DYNAMIC_DETECTION: u32 = 0; +} + +fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { + // "Intel" citations are for "Intel 64 and IA-32 Architectures Software + // Developer’s Manual", Combined Volumes, December 2024. + // "AMD" citations are for "AMD64 Technology AMD64 Architecture + // Programmer’s Manual, Volumes 1-5" Revision 4.08 April 2024. 
+ + // The `prefixed_extern!` uses below assume this + #[cfg(target_arch = "x86_64")] + use core::{mem::align_of, sync::atomic::AtomicU32}; + #[cfg(target_arch = "x86_64")] + const _ATOMIC32_ALIGNMENT_EQUALS_U32_ALIGNMENT: () = + assert!(align_of::() == align_of::()); + + fn check(leaf: u32, bit: u32) -> bool { + let shifted = 1 << bit; + (leaf & shifted) == shifted + } + fn set(out: &mut u32, shift: Shift) { + let shifted = 1 << (shift as u32); + debug_assert_eq!(*out & shifted, 0); + *out |= shifted; + debug_assert_eq!(*out & shifted, shifted); + } + + #[cfg(target_arch = "x86_64")] + let is_intel = check(cpuid[0], 30); // Synthesized by `OPENSSL_cpuid_setup` + + // CPUID leaf 1. + let leaf1_ecx = cpuid[1]; + + // Intel: "Structured Extended Feature Flags Enumeration Leaf" + #[cfg(target_arch = "x86_64")] + let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]); + + let mut caps = 0; + + // AMD: "Collectively the SSE1, [...] are referred to as the legacy SSE + // instructions. All legacy SSE instructions support 128-bit vector + // operands." + + // Intel: "11.6.2 Checking for Intel SSE and SSE2 Support" + // We have to assume the prerequisites for SSE/SSE2 are met since we're + // already almost definitely using SSE registers if these target features + // are enabled. + // + // These also seem to help ensure CMOV support; There doesn't seem to be + // a `cfg!(target_feature = "cmov")`. It is likely that removing these + // assertions will remove the requirement for CMOV. With our without + // CMOV, it is likely that some of our timing side channel prevention does + // not work. Presumably the people who delete these are verifying that it + // all works fine. + const _SSE_REQUIRED: () = assert!(cfg!(target_feature = "sse")); + const _SSE2_REQUIRED: () = assert!(cfg!(target_feature = "sse2")); + + #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] + { + // If somebody is trying to compile for an x86 target without SSE2 + // and they deleted the `_SSE2_REQUIRED` const assertion above then + // they're probably trying to support a Linux/BSD/etc. distro that + // tries to support ancient x86 systems without SSE/SSE2. Try to + // reduce the harm caused, by implementing dynamic feature detection + // for them so that most systems will work like normal. + // + // Note that usually an x86-64 target with SSE2 disabled by default, + // usually `-none-` targets, will not support dynamically-detected use + // of SIMD registers via CPUID. A whole different mechanism is needed + // to support them. Same for i*86-*-none targets. + let leaf1_edx = cpuid[0]; + let sse1_available = check(leaf1_edx, 25); + let sse2_available = check(leaf1_edx, 26); + if sse1_available && sse2_available { + set(&mut caps, Shift::Sse2); + } + } + + // Sometimes people delete the `_SSE_REQUIRED`/`_SSE2_REQUIRED` const + // assertions in an attempt to support pre-SSE2 32-bit x86 systems. If they + // do, hopefully they won't delete these redundant assertions, so that + // x86_64 isn't affected. + #[cfg(target_arch = "x86_64")] + const _SSE2_REQUIRED_X86_64: () = assert!(cfg!(target_feature = "sse2")); + #[cfg(target_arch = "x86_64")] + const _SSE_REQUIRED_X86_64: () = assert!(cfg!(target_feature = "sse2")); + + // Intel: "12.7.2 Checking for SSSE3 Support" + // If/when we support dynamic detection of SSE/SSE2, make this conditional + // on SSE/SSE2. 
+ if check(leaf1_ecx, 9) { + set(&mut caps, Shift::Ssse3); + } + + // Intel: "12.12.2 Checking for Intel SSE4.1 Support" + // If/when we support dynamic detection of SSE/SSE2, make this conditional + // on SSE/SSE2. + // XXX: We don't check for SSE3 and we're not sure if it is compatible for + // us to do so; does AMD advertise SSE3? TODO: address this. + // XXX: We don't condition this on SSSE3 being available. TODO: address + // this. + #[cfg(target_arch = "x86_64")] + if check(leaf1_ecx, 19) { + set(&mut caps, Shift::Sse41); + } + + // AMD: "The extended SSE instructions include [...]." + + // Intel: "14.3 DETECTION OF INTEL AVX INSTRUCTIONS" + // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't + // support AVX state. + let avx_available = check(leaf1_ecx, 28); + if avx_available { + set(&mut caps, Shift::Avx); + } + + #[cfg(target_arch = "x86_64")] + if avx_available { + // The Intel docs don't seem to document the detection. The instruction + // definitions of the VEX.256 instructions reference the + // VAES/VPCLMULQDQ features and the documentation for the extended + // features gives the values. We combine these into one feature because + // we never use them independently. + let vaes_available = check(extended_features_ecx, 9); + let vclmul_available = check(extended_features_ecx, 10); + if vaes_available && vclmul_available { + set(&mut caps, Shift::VAesClmul); + } + } + + // "14.7.1 Detection of Intel AVX2 Hardware support" + // XXX: We don't condition AVX2 on AVX. TODO: Address this. + // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't + // support AVX state. + #[cfg(target_arch = "x86_64")] + if check(extended_features_ebx, 5) { + set(&mut caps, Shift::Avx2); + + // Declared as `uint32_t` in the C code. + prefixed_extern! { + static avx2_available: AtomicU32; + } + // SAFETY: The C code only reads `avx2_available`, and its reads are + // synchronized through the `OnceNonZeroUsize` Acquire/Release + // semantics as we ensure we have a `cpu::Features` instance before + // calling into the C code. + let flag = unsafe { &avx2_available }; + flag.store(1, core::sync::atomic::Ordering::Relaxed); + } + + // Intel: "12.13.4 Checking for Intel AES-NI Support" + // If/when we support dynamic detection of SSE/SSE2, revisit this. + // TODO: Clarify "interesting" states like (!SSE && AVX && AES-NI) + // and AES-NI & !AVX. + // Each check of `ClMul`, `Aes`, and `Sha` must be paired with a check for + // an AVX feature (e.g. `Avx`) or an SSE feature (e.g. `Ssse3`), as every + // use will either be supported by SSE* or AVX* instructions. We then + // assume that those supporting instructions' prerequisites (e.g. OS + // support for AVX or SSE state, respectively) are the only prerequisites + // for these features. + if check(leaf1_ecx, 1) { + set(&mut caps, Shift::ClMul); + } + if check(leaf1_ecx, 25) { + set(&mut caps, Shift::Aes); + } + // See BoringSSL 69c26de93c82ad98daecaec6e0c8644cdf74b03f before enabling + // static feature detection for this. + #[cfg(target_arch = "x86_64")] + if check(extended_features_ebx, 29) { + set(&mut caps, Shift::Sha); + } + + #[cfg(target_arch = "x86_64")] + { + if is_intel { + set(&mut caps, Shift::IntelCpu); + } + + if check(leaf1_ecx, 22) { + set(&mut caps, Shift::Movbe); + } + + let adx_available = check(extended_features_ebx, 19); + if adx_available { + set(&mut caps, Shift::Adx); + } + + // Some 6th Generation (Skylake) CPUs claim to support BMI1 and BMI2 + // when they don't; see erratum "SKD052". 
The Intel document at + // https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/6th-gen-core-u-y-spec-update.pdf + // contains the footnote "Affects 6th Generation Intel Pentium processor + // family and Intel Celeron processor family". Further research indicates + // that Skylake Pentium/Celeron do not implement AVX or ADX. It turns + // out that we only use BMI1 and BMI2 in combination with ADX and/or + // AVX. + // + // rust `std::arch::is_x86_feature_detected` does a very similar thing + // but only looks at AVX, not ADX. Note that they reference an older + // version of the erratum labeled SKL052. + let believe_bmi_bits = !is_intel || (adx_available || avx_available); + + if check(extended_features_ebx, 3) && believe_bmi_bits { + set(&mut caps, Shift::Bmi1); + } + + let bmi2_available = check(extended_features_ebx, 8) && believe_bmi_bits; + if bmi2_available { + set(&mut caps, Shift::Bmi2); + } + + if adx_available && bmi2_available { + // Declared as `uint32_t` in the C code. + prefixed_extern! { + static adx_bmi2_available: AtomicU32; + } + // SAFETY: The C code only reads `adx_bmi2_available`, and its + // reads are synchronized through the `OnceNonZeroUsize` + // Acquire/Release semantics as we ensure we have a + // `cpu::Features` instance before calling into the C code. + let flag = unsafe { &adx_bmi2_available }; + flag.store(1, core::sync::atomic::Ordering::Relaxed); + } + } + + caps +} + +impl_get_feature! { + features: [ + { ("x86_64") => VAesClmul }, + { ("x86", "x86_64") => ClMul }, + { ("x86", "x86_64") => Ssse3 }, + { ("x86_64") => Sse41 }, + { ("x86_64") => Movbe }, + { ("x86", "x86_64") => Aes }, + { ("x86", "x86_64") => Avx }, + { ("x86_64") => Bmi1 }, + { ("x86_64") => Avx2 }, + { ("x86_64") => Bmi2 }, + { ("x86_64") => Adx }, + // See BoringSSL 69c26de93c82ad98daecaec6e0c8644cdf74b03f before enabling + // static feature detection for this. + { ("x86_64") => Sha }, + // x86_64 can just assume SSE2 is available. + { ("x86") => Sse2 }, + ], +} + +cfg_if! { + if #[cfg(target_arch = "x86_64")] { + #[derive(Clone, Copy)] + pub(crate) struct IntelCpu(super::Features); + + impl super::GetFeature for super::features::Values { + fn get_feature(&self) -> Option { + const MASK: u32 = 1 << (Shift::IntelCpu as u32); + if (self.values() & MASK) == MASK { + Some(IntelCpu(self.cpu())) + } else { + None + } + } + } + } +} + +#[cfg(test)] +mod tests { + // This should always pass on any x86 system except very, very, old ones. + #[cfg(target_arch = "x86")] + #[test] + fn x86_has_sse2() { + use super::*; + use crate::cpu::{self, GetFeature as _}; + assert!(matches!(cpu::features().get_feature(), Some(Sse2 { .. }))) + } +} diff --git a/ring-0.17.14/src/data/alg-rsa-encryption.der b/ring-0.17.14/src/data/alg-rsa-encryption.der new file mode 100644 index 0000000000..77d159a1c6 Binary files /dev/null and b/ring-0.17.14/src/data/alg-rsa-encryption.der differ diff --git a/ring-0.17.14/src/debug.rs b/ring-0.17.14/src/debug.rs new file mode 100644 index 0000000000..5de58be461 --- /dev/null +++ b/ring-0.17.14/src/debug.rs @@ -0,0 +1,84 @@ +// Copyright 2018 Trent Clarke. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// Generates an implementation of the Debug trait for a type that defers to the +// Debug implementation for a given field. +macro_rules! derive_debug_via_id { + ($typename:ident) => { + impl ::core::fmt::Debug for $typename { + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { + ::core::fmt::Debug::fmt(&self.id, f) + } + } + }; +} + +macro_rules! derive_debug_via_field { + ($type:ty, $field:ident) => { + derive_debug_via_field!($type, stringify!($type), $field); + }; + + ($type:ty, $typename:expr, $field:ident) => { + impl ::core::fmt::Debug for $type { + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { + f.debug_struct($typename) + .field(stringify!($field), &self.$field) + .finish() + } + } + }; +} + +// Generates an implementation of the Debug trait for a type that outputs the +// hex encoding of the byte slice representation of the value. +macro_rules! derive_debug_self_as_ref_hex_bytes { + ($typename:ident) => { + impl ::core::fmt::Debug for $typename { + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> { + crate::debug::write_hex_tuple(f, stringify!($typename), self) + } + } + }; +} + +pub(crate) fn write_hex_tuple( + fmt: &mut core::fmt::Formatter, + type_name: &str, + value: &dyn AsRef<[u8]>, +) -> Result<(), ::core::fmt::Error> { + fmt.debug_tuple(type_name) + .field(&HexStr(value.as_ref())) + .finish() +} + +pub struct HexStr<'a>(pub &'a [u8]); + +impl core::fmt::Debug for HexStr<'_> { + fn fmt(&self, fmt: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + fmt.write_str("\"")?; + write_hex_bytes(fmt, self.0)?; + fmt.write_str("\"")?; + Ok(()) + } +} + +pub(crate) fn write_hex_bytes( + fmt: &mut core::fmt::Formatter, + bytes: &[u8], +) -> Result<(), ::core::fmt::Error> { + for byte in bytes { + write!(fmt, "{:02x}", byte)?; + } + Ok(()) +} diff --git a/ring-0.17.14/src/deprecated_constant_time.rs b/ring-0.17.14/src/deprecated_constant_time.rs new file mode 100644 index 0000000000..5703c6f08e --- /dev/null +++ b/ring-0.17.14/src/deprecated_constant_time.rs @@ -0,0 +1,22 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{bb, error}; + +#[deprecated( + note = "To be removed. 
Internal function not intended for external use with no promises regarding side channels." +)] +pub fn verify_slices_are_equal(a: &[u8], b: &[u8]) -> Result<(), error::Unspecified> { + bb::verify_slices_are_equal(a, b) +} diff --git a/ring-0.17.14/src/deprecated_test.rs b/ring-0.17.14/src/deprecated_test.rs new file mode 100644 index 0000000000..3758aa2b0b --- /dev/null +++ b/ring-0.17.14/src/deprecated_test.rs @@ -0,0 +1,45 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![doc(hidden)] + +/// References a test input file. +#[macro_export] +macro_rules! test_file { + ($file_name:expr) => { + $crate::test::File { + file_name: $file_name, + contents: include_str!($file_name), + } + }; +} + +pub use crate::testutil::{ + compile_time_assert_clone, compile_time_assert_copy, compile_time_assert_eq, + compile_time_assert_send, compile_time_assert_sync, from_hex, run, File, TestCase, +}; + +#[cfg(feature = "std")] +pub use crate::testutil::compile_time_assert_std_error_error; + +#[deprecated(note = "internal API that will be removed")] +#[doc(hidden)] +pub mod rand { + #[deprecated(note = "internal API that will be removed")] + pub type FixedByteRandom = crate::testutil::rand::FixedByteRandom; + #[deprecated(note = "internal API that will be removed")] + pub type FixedSliceRandom<'a> = crate::testutil::rand::FixedSliceRandom<'a>; + #[deprecated(note = "internal API that will be removed")] + pub type FixedSliceSequenceRandom<'a> = crate::testutil::rand::FixedSliceSequenceRandom<'a>; +} diff --git a/ring-0.17.14/src/digest.rs b/ring-0.17.14/src/digest.rs new file mode 100644 index 0000000000..e771237786 --- /dev/null +++ b/ring-0.17.14/src/digest.rs @@ -0,0 +1,680 @@ +// Copyright 2015-2019 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! SHA-2 and the legacy SHA-1 digest algorithm. +//! +//! If all the data is available in a single contiguous slice then the `digest` +//! function should be used. Otherwise, the digest can be calculated in +//! multiple steps using `Context`. 
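The module documentation above describes two usage patterns: the one-shot `digest` function and the incremental `Context`. As a minimal sketch of both, assuming this vendored source is consumed through the published `ring::digest` API:

```
// Sketch only; assumes the vendored crate is used as the published `ring`
// crate, so `ring::digest` exposes `digest`, `Context`, and `SHA256`.
use ring::digest::{self, Context, SHA256};

fn main() {
    // One-shot: all of the data is available in a single contiguous slice.
    let one_shot = digest::digest(&SHA256, b"hello, world");

    // Incremental: the digest is calculated in multiple steps.
    let mut ctx = Context::new(&SHA256);
    ctx.update(b"hello");
    ctx.update(b", world");
    let multi_part = ctx.finish();

    assert_eq!(one_shot.as_ref(), multi_part.as_ref());
}
```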
+ +use self::{ + dynstate::DynState, + sha2::{SHA256_BLOCK_LEN, SHA512_BLOCK_LEN}, +}; +use crate::{ + bits::{BitLength, FromByteLen as _}, + cpu, debug, error, + polyfill::{self, slice, sliceutil}, +}; +use core::num::Wrapping; + +pub(crate) use self::finish_error::FinishError; + +mod dynstate; +mod sha1; +mod sha2; + +#[derive(Clone)] +pub(crate) struct BlockContext { + state: DynState, + + // Note that SHA-512 has a 128-bit input bit counter, but this + // implementation only supports up to 2^64-1 input bits for all algorithms, + // so a 64-bit counter is more than sufficient. + completed_bytes: u64, + + /// The context's algorithm. + pub algorithm: &'static Algorithm, +} + +impl BlockContext { + pub(crate) fn new(algorithm: &'static Algorithm) -> Self { + Self { + state: algorithm.initial_state.clone(), + completed_bytes: 0, + algorithm, + } + } + + /// Processes all the full blocks in `input`, returning the partial block + /// at the end, which may be empty. + pub(crate) fn update<'i>(&mut self, input: &'i [u8], cpu_features: cpu::Features) -> &'i [u8] { + let (completed_bytes, leftover) = self.block_data_order(input, cpu_features); + // Using saturated addition here allows `update` to be infallible and + // panic-free. If we were to reach the maximum value here then `finish` + // will detect that we processed too much data when it converts this to + // a bit length. + self.completed_bytes = self + .completed_bytes + .saturating_add(polyfill::u64_from_usize(completed_bytes)); + leftover + } + + // On input, `block[..num_pending]` is the (possibly-empty) last *partial* + // chunk of input. It *must* be partial; that is, it is required that + // `num_pending < self.algorithm.block_len`. + // + // `block` may be arbitrarily overwritten. + pub(crate) fn try_finish( + mut self, + block: &mut [u8; MAX_BLOCK_LEN], + num_pending: usize, + cpu_features: cpu::Features, + ) -> Result { + let completed_bits = self + .completed_bytes + .checked_add(polyfill::u64_from_usize(num_pending)) + .ok_or_else(|| { + // Choosing self.completed_bytes here is lossy & somewhat arbitrary. + InputTooLongError::new(self.completed_bytes) + }) + .and_then(BitLength::from_byte_len) + .map_err(FinishError::input_too_long)?; + + let block_len = self.algorithm.block_len(); + let block = &mut block[..block_len]; + + let padding = match block.get_mut(num_pending..) { + Some([separator, padding @ ..]) => { + *separator = 0x80; + padding + } + // Precondition violated. + unreachable => { + return Err(FinishError::pending_not_a_partial_block( + unreachable.as_deref(), + )); + } + }; + + let padding = match padding + .len() + .checked_sub(self.algorithm.block_len.len_len()) + { + Some(_) => padding, + None => { + padding.fill(0); + let (completed_bytes, leftover) = self.block_data_order(block, cpu_features); + debug_assert_eq!((completed_bytes, leftover.len()), (block_len, 0)); + // We don't increase |self.completed_bytes| because the padding + // isn't data, and so it isn't included in the data length. + &mut block[..] 
+ } + }; + + let (to_zero, len) = padding.split_at_mut(padding.len() - 8); + to_zero.fill(0); + len.copy_from_slice(&completed_bits.to_be_bytes()); + + let (completed_bytes, leftover) = self.block_data_order(block, cpu_features); + debug_assert_eq!((completed_bytes, leftover.len()), (block_len, 0)); + + Ok(Digest { + algorithm: self.algorithm, + value: self.state.format_output(), + }) + } + + #[must_use] + fn block_data_order<'d>( + &mut self, + data: &'d [u8], + cpu_features: cpu::Features, + ) -> (usize, &'d [u8]) { + (self.algorithm.block_data_order)(&mut self.state, data, cpu_features) + } +} + +pub(crate) type InputTooLongError = error::InputTooLongError; + +cold_exhaustive_error! { + enum finish_error::FinishError { + input_too_long => InputTooLong(InputTooLongError), + pending_not_a_partial_block_inner => PendingNotAPartialBlock(usize), + } +} + +impl FinishError { + #[cold] + #[inline(never)] + fn pending_not_a_partial_block(padding: Option<&[u8]>) -> Self { + match padding { + None => Self::pending_not_a_partial_block_inner(0), + Some(padding) => Self::pending_not_a_partial_block_inner(padding.len()), + } + } +} + +/// A context for multi-step (Init-Update-Finish) digest calculations. +/// +/// # Examples +/// +/// ``` +/// use ring::digest; +/// +/// let one_shot = digest::digest(&digest::SHA384, b"hello, world"); +/// +/// let mut ctx = digest::Context::new(&digest::SHA384); +/// ctx.update(b"hello"); +/// ctx.update(b", "); +/// ctx.update(b"world"); +/// let multi_part = ctx.finish(); +/// +/// assert_eq!(&one_shot.as_ref(), &multi_part.as_ref()); +/// ``` +#[derive(Clone)] +pub struct Context { + block: BlockContext, + // TODO: More explicitly force 64-bit alignment for |pending|. + pending: [u8; MAX_BLOCK_LEN], + + // Invariant: `self.num_pending < self.block.algorithm.block_len`. + num_pending: usize, +} + +impl Context { + /// Constructs a new context. + pub fn new(algorithm: &'static Algorithm) -> Self { + Self { + block: BlockContext::new(algorithm), + pending: [0u8; MAX_BLOCK_LEN], + num_pending: 0, + } + } + + pub(crate) fn clone_from(block: &BlockContext) -> Self { + Self { + block: block.clone(), + pending: [0u8; MAX_BLOCK_LEN], + num_pending: 0, + } + } + + /// Updates the digest with all the data in `data`. + pub fn update(&mut self, data: &[u8]) { + let cpu_features = cpu::features(); + + let block_len = self.block.algorithm.block_len(); + let buffer = &mut self.pending[..block_len]; + + let to_digest = if self.num_pending == 0 { + data + } else { + let buffer_to_fill = match buffer.get_mut(self.num_pending..) { + Some(buffer_to_fill) => buffer_to_fill, + None => { + // Impossible because of the invariant. + unreachable!(); + } + }; + sliceutil::overwrite_at_start(buffer_to_fill, data); + match slice::split_at_checked(data, buffer_to_fill.len()) { + Some((just_copied, to_digest)) => { + debug_assert_eq!(buffer_to_fill.len(), just_copied.len()); + debug_assert_eq!(self.num_pending + just_copied.len(), block_len); + let leftover = self.block.update(buffer, cpu_features); + debug_assert_eq!(leftover.len(), 0); + self.num_pending = 0; + to_digest + } + None => { + self.num_pending += data.len(); + // If `data` isn't enough to complete a block, buffer it and stop. 
+ debug_assert!(self.num_pending < block_len); + return; + } + } + }; + + let leftover = self.block.update(to_digest, cpu_features); + sliceutil::overwrite_at_start(buffer, leftover); + self.num_pending = leftover.len(); + debug_assert!(self.num_pending < block_len); + } + + /// Finalizes the digest calculation and returns the digest value. + /// + /// `finish` consumes the context so it cannot be (mis-)used after `finish` + /// has been called. + pub fn finish(self) -> Digest { + let cpu = cpu::features(); + self.try_finish(cpu) + .map_err(error::erase::) + .unwrap() + } + + pub(crate) fn try_finish( + mut self, + cpu_features: cpu::Features, + ) -> Result { + self.block + .try_finish(&mut self.pending, self.num_pending, cpu_features) + .map_err(|err| match err { + FinishError::InputTooLong(i) => i, + FinishError::PendingNotAPartialBlock(_) => { + // Due to invariant. + unreachable!() + } + }) + } + + /// The algorithm that this context is using. + #[inline(always)] + pub fn algorithm(&self) -> &'static Algorithm { + self.block.algorithm + } +} + +/// Returns the digest of `data` using the given digest algorithm. +pub fn digest(algorithm: &'static Algorithm, data: &[u8]) -> Digest { + let cpu = cpu::features(); + Digest::compute_from(algorithm, data, cpu) + .map_err(error::erase::) + .unwrap() +} + +/// A calculated digest value. +/// +/// Use [`Self::as_ref`] to get the value as a `&[u8]`. +#[derive(Clone, Copy)] +pub struct Digest { + value: Output, + algorithm: &'static Algorithm, +} + +impl Digest { + pub(crate) fn compute_from( + algorithm: &'static Algorithm, + data: &[u8], + cpu: cpu::Features, + ) -> Result { + let mut ctx = Context::new(algorithm); + ctx.update(data); + ctx.try_finish(cpu) + } + + /// The algorithm that was used to calculate the digest value. + #[inline(always)] + pub fn algorithm(&self) -> &'static Algorithm { + self.algorithm + } +} + +impl AsRef<[u8]> for Digest { + #[inline(always)] + fn as_ref(&self) -> &[u8] { + &self.value.0[..self.algorithm.output_len()] + } +} + +impl core::fmt::Debug for Digest { + fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(fmt, "{:?}:", self.algorithm)?; + debug::write_hex_bytes(fmt, self.as_ref()) + } +} + +/// A digest algorithm. +pub struct Algorithm { + output_len: OutputLen, + chaining_len: usize, + block_len: BlockLen, + + /// `block_data_order` processes all the full blocks of data in `data`. It + /// returns the number of bytes processed and the unprocessed data, which + /// is guaranteed to be less than `block_len` bytes long. + block_data_order: for<'d> fn( + state: &mut DynState, + data: &'d [u8], + cpu_features: cpu::Features, + ) -> (usize, &'d [u8]), + + initial_state: DynState, + + id: AlgorithmID, +} + +#[derive(Debug, Eq, PartialEq)] +enum AlgorithmID { + SHA1, + SHA256, + SHA384, + SHA512, + SHA512_256, +} + +impl PartialEq for Algorithm { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for Algorithm {} + +derive_debug_via_id!(Algorithm); + +impl Algorithm { + /// The internal block length. + pub fn block_len(&self) -> usize { + self.block_len.into() + } + + /// The size of the chaining value of the digest function, in bytes. + /// + /// For non-truncated algorithms (SHA-1, SHA-256, SHA-512), this is equal + /// to [`Self::output_len()`]. For truncated algorithms (e.g. SHA-384, + /// SHA-512/256), this is equal to the length before truncation. 
This is + /// mostly helpful for determining the size of an HMAC key that is + /// appropriate for the digest algorithm. + pub fn chaining_len(&self) -> usize { + self.chaining_len + } + + /// The length of a finalized digest. + pub fn output_len(&self) -> usize { + self.output_len.into() + } +} + +/// SHA-1 as specified in [FIPS 180-4]. Deprecated. +/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA1_FOR_LEGACY_USE_ONLY: Algorithm = Algorithm { + output_len: sha1::OUTPUT_LEN, + chaining_len: sha1::CHAINING_LEN, + block_len: sha1::BLOCK_LEN, + block_data_order: dynstate::sha1_block_data_order, + initial_state: DynState::new32([ + Wrapping(0x67452301u32), + Wrapping(0xefcdab89u32), + Wrapping(0x98badcfeu32), + Wrapping(0x10325476u32), + Wrapping(0xc3d2e1f0u32), + Wrapping(0), + Wrapping(0), + Wrapping(0), + ]), + id: AlgorithmID::SHA1, +}; + +/// SHA-256 as specified in [FIPS 180-4]. +/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA256: Algorithm = Algorithm { + output_len: OutputLen::_256, + chaining_len: SHA256_OUTPUT_LEN, + block_len: SHA256_BLOCK_LEN, + block_data_order: dynstate::sha256_block_data_order, + initial_state: DynState::new32([ + Wrapping(0x6a09e667u32), + Wrapping(0xbb67ae85u32), + Wrapping(0x3c6ef372u32), + Wrapping(0xa54ff53au32), + Wrapping(0x510e527fu32), + Wrapping(0x9b05688cu32), + Wrapping(0x1f83d9abu32), + Wrapping(0x5be0cd19u32), + ]), + id: AlgorithmID::SHA256, +}; + +/// SHA-384 as specified in [FIPS 180-4]. +/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA384: Algorithm = Algorithm { + output_len: OutputLen::_384, + chaining_len: SHA512_OUTPUT_LEN, + block_len: SHA512_BLOCK_LEN, + block_data_order: dynstate::sha512_block_data_order, + initial_state: DynState::new64([ + Wrapping(0xcbbb9d5dc1059ed8), + Wrapping(0x629a292a367cd507), + Wrapping(0x9159015a3070dd17), + Wrapping(0x152fecd8f70e5939), + Wrapping(0x67332667ffc00b31), + Wrapping(0x8eb44a8768581511), + Wrapping(0xdb0c2e0d64f98fa7), + Wrapping(0x47b5481dbefa4fa4), + ]), + id: AlgorithmID::SHA384, +}; + +/// SHA-512 as specified in [FIPS 180-4]. +/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA512: Algorithm = Algorithm { + output_len: OutputLen::_512, + chaining_len: SHA512_OUTPUT_LEN, + block_len: SHA512_BLOCK_LEN, + block_data_order: dynstate::sha512_block_data_order, + initial_state: DynState::new64([ + Wrapping(0x6a09e667f3bcc908), + Wrapping(0xbb67ae8584caa73b), + Wrapping(0x3c6ef372fe94f82b), + Wrapping(0xa54ff53a5f1d36f1), + Wrapping(0x510e527fade682d1), + Wrapping(0x9b05688c2b3e6c1f), + Wrapping(0x1f83d9abfb41bd6b), + Wrapping(0x5be0cd19137e2179), + ]), + id: AlgorithmID::SHA512, +}; + +/// SHA-512/256 as specified in [FIPS 180-4]. +/// +/// This is *not* the same as just truncating the output of SHA-512, as +/// SHA-512/256 has its own initial state distinct from SHA-512's initial +/// state. 
+/// +/// [FIPS 180-4]: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +pub static SHA512_256: Algorithm = Algorithm { + output_len: OutputLen::_256, + chaining_len: SHA512_OUTPUT_LEN, + block_len: SHA512_BLOCK_LEN, + block_data_order: dynstate::sha512_block_data_order, + initial_state: DynState::new64([ + Wrapping(0x22312194fc2bf72c), + Wrapping(0x9f555fa3c84c64c2), + Wrapping(0x2393b86b6f53b151), + Wrapping(0x963877195940eabd), + Wrapping(0x96283ee2a88effe3), + Wrapping(0xbe5e1e2553863992), + Wrapping(0x2b0199fc2c85b8aa), + Wrapping(0x0eb72ddc81c52ca2), + ]), + id: AlgorithmID::SHA512_256, +}; + +#[derive(Clone, Copy)] +struct Output([u8; MAX_OUTPUT_LEN]); + +/// The maximum block length ([`Algorithm::block_len()`]) of all the algorithms +/// in this module. +pub const MAX_BLOCK_LEN: usize = BlockLen::MAX.into(); + +/// The maximum output length ([`Algorithm::output_len()`]) of all the +/// algorithms in this module. +pub const MAX_OUTPUT_LEN: usize = OutputLen::MAX.into(); + +/// The maximum chaining length ([`Algorithm::chaining_len()`]) of all the +/// algorithms in this module. +pub const MAX_CHAINING_LEN: usize = MAX_OUTPUT_LEN; + +#[inline] +fn format_output(input: [Wrapping; sha2::CHAINING_WORDS], f: F) -> Output +where + F: Fn(T) -> [u8; N], + T: Copy, +{ + let mut output = Output([0; MAX_OUTPUT_LEN]); + output + .0 + .chunks_mut(N) + .zip(input.iter().copied().map(|Wrapping(w)| f(w))) + .for_each(|(o, i)| { + o.copy_from_slice(&i); + }); + output +} + +/// The length of the output of SHA-1, in bytes. +pub const SHA1_OUTPUT_LEN: usize = sha1::OUTPUT_LEN.into(); + +/// The length of the output of SHA-256, in bytes. +pub const SHA256_OUTPUT_LEN: usize = OutputLen::_256.into(); + +/// The length of the output of SHA-384, in bytes. +pub const SHA384_OUTPUT_LEN: usize = OutputLen::_384.into(); + +/// The length of the output of SHA-512, in bytes. +pub const SHA512_OUTPUT_LEN: usize = OutputLen::_512.into(); + +/// The length of the output of SHA-512/256, in bytes. +pub const SHA512_256_OUTPUT_LEN: usize = OutputLen::_256.into(); + +#[derive(Clone, Copy)] +enum BlockLen { + _512 = 512 / 8, + _1024 = 1024 / 8, // MAX +} + +impl BlockLen { + const MAX: Self = Self::_1024; + #[inline(always)] + const fn into(self) -> usize { + self as usize + } + + #[inline(always)] + const fn len_len(self) -> usize { + let len_len = match self { + BlockLen::_512 => LenLen::_64, + BlockLen::_1024 => LenLen::_128, + }; + len_len as usize + } +} + +#[derive(Clone, Copy)] +enum LenLen { + _64 = 64 / 8, + _128 = 128 / 8, +} + +#[derive(Clone, Copy)] +enum OutputLen { + _160 = 160 / 8, + _256 = 256 / 8, + _384 = 384 / 8, + _512 = 512 / 8, // MAX +} + +impl OutputLen { + const MAX: Self = Self::_512; + + #[inline(always)] + const fn into(self) -> usize { + self as usize + } +} + +#[cfg(test)] +mod tests { + mod max_input { + extern crate alloc; + use super::super::super::digest; + use crate::polyfill::u64_from_usize; + use alloc::vec; + + macro_rules! 
max_input_tests { + ( $algorithm_name:ident ) => { + mod $algorithm_name { + use super::super::super::super::digest; + + #[test] + fn max_input_test() { + super::max_input_test(&digest::$algorithm_name); + } + + #[test] + #[should_panic] + fn too_long_input_test_block() { + super::too_long_input_test_block(&digest::$algorithm_name); + } + + #[test] + #[should_panic] + fn too_long_input_test_byte() { + super::too_long_input_test_byte(&digest::$algorithm_name); + } + } + }; + } + + fn max_input_test(alg: &'static digest::Algorithm) { + let mut context = nearly_full_context(alg); + let next_input = vec![0u8; alg.block_len() - 1]; + context.update(&next_input); + let _ = context.finish(); // no panic + } + + fn too_long_input_test_block(alg: &'static digest::Algorithm) { + let mut context = nearly_full_context(alg); + let next_input = vec![0u8; alg.block_len()]; + context.update(&next_input); + let _ = context.finish(); // should panic + } + + fn too_long_input_test_byte(alg: &'static digest::Algorithm) { + let mut context = nearly_full_context(alg); + let next_input = vec![0u8; alg.block_len() - 1]; + context.update(&next_input); + context.update(&[0]); + let _ = context.finish(); // should panic + } + + fn nearly_full_context(alg: &'static digest::Algorithm) -> digest::Context { + // All implementations currently support up to 2^64-1 bits + // of input; according to the spec, SHA-384 and SHA-512 + // support up to 2^128-1, but that's not implemented yet. + let max_bytes = 1u64 << (64 - 3); + let max_blocks = max_bytes / u64_from_usize(alg.block_len()); + let completed_bytes = (max_blocks - 1) * u64_from_usize(alg.block_len()); + digest::Context { + block: digest::BlockContext { + state: alg.initial_state.clone(), + completed_bytes, + algorithm: alg, + }, + pending: [0u8; digest::MAX_BLOCK_LEN], + num_pending: 0, + } + } + + max_input_tests!(SHA1_FOR_LEGACY_USE_ONLY); + max_input_tests!(SHA256); + max_input_tests!(SHA384); + max_input_tests!(SHA512); + } +} diff --git a/ring-0.17.14/src/digest/dynstate.rs b/ring-0.17.14/src/digest/dynstate.rs new file mode 100644 index 0000000000..c74f69f906 --- /dev/null +++ b/ring-0.17.14/src/digest/dynstate.rs @@ -0,0 +1,98 @@ +// Copyright 2015-2019 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{format_output, sha1, sha2, Output}; +use crate::{cpu, polyfill::slice}; +use core::mem::size_of; + +// Invariant: When constructed with `new32` (resp. `new64`), `As32` (resp. +// `As64`) is the active variant. +// Invariant: The active variant never changes after initialization. 
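The invariant stated above is what lets the `sha*_block_data_order` dispatchers later in this file match on a single variant of the enum that follows and treat the other arm as `unreachable!()`. A minimal sketch of that constructor-fixes-the-variant pattern, with illustrative names that are not part of the vendored crate:

```
// Illustrative only: the variant is fixed by the constructor and never
// changes, so downstream code may reject the other arm outright.
enum State {
    As32([u32; 8]),
    As64([u64; 8]),
}

impl State {
    const fn new32(words: [u32; 8]) -> Self {
        Self::As32(words)
    }

    const fn new64(words: [u64; 8]) -> Self {
        Self::As64(words)
    }
}

fn compress32(state: &mut State, _block: &[u8; 64]) {
    let words = match state {
        State::As32(words) => words,
        // Safe to treat as unreachable: 32-bit callers only ever hold
        // states built with `new32`, and the variant never changes.
        State::As64(_) => unreachable!(),
    };
    words[0] = words[0].wrapping_add(1); // stand-in for the real compression
}
```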
+#[derive(Clone)] +pub(super) enum DynState { + As64(sha2::State64), + As32(sha2::State32), +} + +impl DynState { + pub const fn new32(initial_state: sha2::State32) -> Self { + Self::As32(initial_state) + } + + pub const fn new64(initial_state: sha2::State64) -> Self { + Self::As64(initial_state) + } + + pub fn format_output(self) -> Output { + match self { + Self::As64(state) => { + format_output::<_, _, { size_of::() }>(state, u64::to_be_bytes) + } + Self::As32(state) => { + format_output::<_, _, { size_of::() }>(state, u32::to_be_bytes) + } + } + } +} + +pub(super) fn sha1_block_data_order<'d>( + state: &mut DynState, + data: &'d [u8], + _cpu_features: cpu::Features, +) -> (usize, &'d [u8]) { + let state = match state { + DynState::As32(state) => state, + _ => { + unreachable!(); + } + }; + + let (full_blocks, leftover) = slice::as_chunks(data); + sha1::sha1_block_data_order(state, full_blocks); + (full_blocks.as_flattened().len(), leftover) +} + +pub(super) fn sha256_block_data_order<'d>( + state: &mut DynState, + data: &'d [u8], + cpu_features: cpu::Features, +) -> (usize, &'d [u8]) { + let state = match state { + DynState::As32(state) => state, + _ => { + unreachable!(); + } + }; + + let (full_blocks, leftover) = slice::as_chunks(data); + sha2::block_data_order_32(state, full_blocks, cpu_features); + (full_blocks.len() * sha2::SHA256_BLOCK_LEN.into(), leftover) +} + +pub(super) fn sha512_block_data_order<'d>( + state: &mut DynState, + data: &'d [u8], + cpu_features: cpu::Features, +) -> (usize, &'d [u8]) { + let state = match state { + DynState::As64(state) => state, + _ => { + unreachable!(); + } + }; + + let (full_blocks, leftover) = slice::as_chunks(data); + sha2::block_data_order_64(state, full_blocks, cpu_features); + (full_blocks.len() * sha2::SHA512_BLOCK_LEN.into(), leftover) +} diff --git a/ring-0.17.14/src/digest/sha1.rs b/ring-0.17.14/src/digest/sha1.rs new file mode 100644 index 0000000000..e8a4616d15 --- /dev/null +++ b/ring-0.17.14/src/digest/sha1.rs @@ -0,0 +1,119 @@ +// Copyright 2015-2025 Brian Smith. +// Copyright 2016 Simon Sapin. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
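Before the implementation that follows, a compact sketch of the FIPS 180-4 round schedule it realizes: SHA-1's eighty rounds use `ch`, `parity`, `maj`, and `parity` again in blocks of twenty, each with a fixed constant, which is exactly the pairing the four `step3` calls below spell out. The helper is illustrative only, not part of the vendored file:

```
// Illustrative only: maps a SHA-1 round index t (0..80) to the (K_t, f_t)
// pair from FIPS 180-4 sections 4.1.1 and 4.2.1.
fn sha1_round_params(t: usize) -> (u32, fn(u32, u32, u32) -> u32) {
    fn ch(x: u32, y: u32, z: u32) -> u32 { (x & y) | (!x & z) }
    fn parity(x: u32, y: u32, z: u32) -> u32 { x ^ y ^ z }
    fn maj(x: u32, y: u32, z: u32) -> u32 { (x & y) | (x & z) | (y & z) }

    match t {
        0..=19 => (0x5a82_7999, ch),
        20..=39 => (0x6ed9_eba1, parity),
        40..=59 => (0x8f1b_bcdc, maj),
        60..=79 => (0xca62_c1d6, parity),
        _ => panic!("SHA-1 has exactly 80 rounds"),
    }
}
```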
+ +use super::{ + sha2::{ + fallback::{ch, maj, Word}, + State32, + }, + BlockLen, OutputLen, +}; +use crate::polyfill::slice::{self, AsChunks}; +use core::{mem::size_of, num::Wrapping}; + +pub(super) const BLOCK_LEN: BlockLen = BlockLen::_512; +pub const CHAINING_LEN: usize = 160 / 8; +pub(super) const OUTPUT_LEN: OutputLen = OutputLen::_160; +const CHAINING_WORDS: usize = CHAINING_LEN / 4; + +type W32 = Wrapping; + +// FIPS 180-4 4.1.1 +#[inline] +fn parity(x: W32, y: W32, z: W32) -> W32 { + x ^ y ^ z +} + +type State = [W32; CHAINING_WORDS]; +const ROUNDS: usize = 80; + +pub fn sha1_block_data_order(state: &mut State32, data: AsChunks) { + // The unwrap won't fail because `CHAINING_WORDS` is smaller than the + // length. + let state: &mut State = (&mut state[..CHAINING_WORDS]).try_into().unwrap(); + // SAFETY: The caller guarantees that this is called with data pointing to `num` + // `BLOCK_LEN`-long blocks. + *state = block_data_order(*state, data) +} + +#[inline] +#[rustfmt::skip] +fn block_data_order( + mut H: [W32; CHAINING_WORDS], + M: AsChunks, +) -> [W32; CHAINING_WORDS] +{ + for M in M { + let (M, remainder): (AsChunks()}>, &[u8]) = slice::as_chunks(M); + debug_assert!(remainder.is_empty()); + + // FIPS 180-4 6.1.2 Step 1 + let mut W: [W32; ROUNDS] = [W32::ZERO; ROUNDS]; + W.iter_mut().zip(M).for_each(|(Wt, Mt)| { + *Wt = W32::from_be_bytes(*Mt); + }); + for t in 16..ROUNDS { + let wt = W[t - 3] ^ W[t - 8] ^ W[t - 14] ^ W[t - 16]; + W[t] = rotl(wt, 1); + } + + // FIPS 180-4 6.1.2 Step 2 + let [a, b, c, d, e] = H; + + // FIPS 180-4 6.1.2 Step 3 with constants and functions from FIPS 180-4 {4.1.1, 4.2.1} + let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 0, Wrapping(0x5a827999), ch); + let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 20, Wrapping(0x6ed9eba1), parity); + let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 40, Wrapping(0x8f1bbcdc), maj); + let (a, b, c, d, e) = step3(a, b, c, d, e, &W, 60, Wrapping(0xca62c1d6), parity); + + // FIPS 180-4 6.1.2 Step 4 + H[0] += a; + H[1] += b; + H[2] += c; + H[3] += d; + H[4] += e; + } + + H +} + +#[inline(always)] +fn step3( + mut a: W32, + mut b: W32, + mut c: W32, + mut d: W32, + mut e: W32, + W: &[W32; 80], + t: usize, + k: W32, + f: impl Fn(W32, W32, W32) -> W32, +) -> (W32, W32, W32, W32, W32) { + let W = &W[t..(t + 20)]; + for W_t in W.iter() { + let T = rotl(a, 5) + f(b, c, d) + e + k + W_t; + e = d; + d = c; + c = rotl(b, 30); + b = a; + a = T; + } + (a, b, c, d, e) +} + +#[inline(always)] +fn rotl(x: W32, n: u32) -> W32 { + Wrapping(x.0.rotate_left(n)) +} diff --git a/ring-0.17.14/src/digest/sha2/fallback.rs b/ring-0.17.14/src/digest/sha2/fallback.rs new file mode 100644 index 0000000000..556f57db51 --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/fallback.rs @@ -0,0 +1,372 @@ +// Copyright 2019-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::CHAINING_WORDS; +use crate::polyfill::slice::{self, AsChunks}; +use core::{ + num::Wrapping, + ops::{Add, AddAssign, BitAnd, BitOr, BitXor, Not, Shr}, +}; + +#[cfg_attr( + any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" + ), + allow(dead_code) +)] +#[inline] +pub(super) fn block_data_order( + mut H: [S; CHAINING_WORDS], + M: AsChunks, +) -> [S; CHAINING_WORDS] +where + for<'a> &'a S::InputBytes: From<&'a [u8; BYTES_LEN]>, +{ + for M in M { + let (M, remainder): (AsChunks, &[u8]) = slice::as_chunks(M); + debug_assert!(remainder.is_empty()); + + // FIPS 180-4 {6.2.2, 6.4.2} Step 1 + // + // TODO(MSRV): Use `let W: [S::from(0); S::ROUNDS]` instead; depends on + // https://github.com/rust-lang/rust/issues/43408. + let mut W = S::zero_w(); + let W = W.as_mut(); + W.iter_mut().zip(M).for_each(|(Wt, Mt)| { + let Mt: &S::InputBytes = Mt.into(); + *Wt = S::from_be_bytes(*Mt); + }); + for t in 16..S::ROUNDS { + W[t] = sigma_1(W[t - 2]) + W[t - 7] + sigma_0(W[t - 15]) + W[t - 16] + } + + // FIPS 180-4 {6.2.2, 6.4.2} Step 2 + let [mut a, mut b, mut c, mut d, mut e, mut f, mut g, mut h] = H; + + // FIPS 180-4 {6.2.2, 6.4.2} Step 3 + for (Kt, Wt) in S::K.as_ref().iter().zip(W.iter()) { + let T1 = h + SIGMA_1(e) + ch(e, f, g) + *Kt + *Wt; + let T2 = SIGMA_0(a) + maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + // FIPS 180-4 {6.2.2, 6.4.2} Step 4 + H[0] += a; + H[1] += b; + H[2] += c; + H[3] += d; + H[4] += e; + H[5] += f; + H[6] += g; + H[7] += h; + } + + H +} + +// FIPS 180-4 {4.1.1, 4.1.2, 4.1.3} +#[inline(always)] +pub(in super::super) fn ch(x: W, y: W, z: W) -> W { + (x & y) | (!x & z) +} + +// FIPS 180-4 {4.1.1, 4.1.2, 4.1.3} +#[inline(always)] +pub(in super::super) fn maj(x: W, y: W, z: W) -> W { + (x & y) | (x & z) | (y & z) +} + +// FIPS 180-4 {4.1.2, 4.1.3} +#[inline(always)] +fn SIGMA_0(x: S) -> S { + x.rotr(S::BIG_SIGMA_0.0) ^ x.rotr(S::BIG_SIGMA_0.1) ^ x.rotr(S::BIG_SIGMA_0.2) +} + +// FIPS 180-4 {4.1.2, 4.1.3} +#[inline(always)] +fn SIGMA_1(x: S) -> S { + x.rotr(S::BIG_SIGMA_1.0) ^ x.rotr(S::BIG_SIGMA_1.1) ^ x.rotr(S::BIG_SIGMA_1.2) +} + +// FIPS 180-4 {4.1.2, 4.1.3} +#[inline(always)] +fn sigma_0(x: S) -> S { + x.rotr(S::SMALL_SIGMA_0.0) ^ x.rotr(S::SMALL_SIGMA_0.1) ^ (x >> S::SMALL_SIGMA_0.2) +} + +// FIPS 180-4 {4.1.2, 4.1.3} +#[inline(always)] +fn sigma_1(x: S) -> S { + x.rotr(S::SMALL_SIGMA_1.0) ^ x.rotr(S::SMALL_SIGMA_1.1) ^ (x >> S::SMALL_SIGMA_1.2) +} + +// Commonality between SHA-1 and SHA-2 words. +pub(in super::super) trait Word: + 'static + + Sized + + Copy + + Add + + AddAssign + + BitAnd + + BitOr + + Not +{ + const ZERO: Self; + + type InputBytes: Copy; + + fn from_be_bytes(input: Self::InputBytes) -> Self; + + fn rotr(self, count: u32) -> Self; +} + +/// A SHA-2 input word. 
+pub(super) trait Sha2: Word + BitXor + Shr { + const BIG_SIGMA_0: (u32, u32, u32); + const BIG_SIGMA_1: (u32, u32, u32); + const SMALL_SIGMA_0: (u32, u32, usize); + const SMALL_SIGMA_1: (u32, u32, usize); + + const ROUNDS: usize; + + type W: AsRef<[Self]> + AsMut<[Self]>; + fn zero_w() -> Self::W; + + const K: &'static Self::W; +} + +impl Word for Wrapping { + const ZERO: Self = Self(0); + type InputBytes = [u8; 4]; + + #[inline(always)] + fn from_be_bytes(input: Self::InputBytes) -> Self { + Self(u32::from_be_bytes(input)) + } + + #[inline(always)] + fn rotr(self, count: u32) -> Self { + Self(self.0.rotate_right(count)) + } +} + +// SHA-256 +impl Sha2 for Wrapping { + // FIPS 180-4 4.1.2 + const BIG_SIGMA_0: (u32, u32, u32) = (2, 13, 22); + const BIG_SIGMA_1: (u32, u32, u32) = (6, 11, 25); + const SMALL_SIGMA_0: (u32, u32, usize) = (7, 18, 3); + const SMALL_SIGMA_1: (u32, u32, usize) = (17, 19, 10); + + // FIPS 180-4 {6.2.2} Step 1 + const ROUNDS: usize = 64; + + type W = [Self; Self::ROUNDS]; + fn zero_w() -> Self::W { + [Self::ZERO; Self::ROUNDS] + } + + // FIPS 180-4 4.2.2 + const K: &'static Self::W = &[ + Self(0x428a2f98), + Self(0x71374491), + Self(0xb5c0fbcf), + Self(0xe9b5dba5), + Self(0x3956c25b), + Self(0x59f111f1), + Self(0x923f82a4), + Self(0xab1c5ed5), + Self(0xd807aa98), + Self(0x12835b01), + Self(0x243185be), + Self(0x550c7dc3), + Self(0x72be5d74), + Self(0x80deb1fe), + Self(0x9bdc06a7), + Self(0xc19bf174), + Self(0xe49b69c1), + Self(0xefbe4786), + Self(0x0fc19dc6), + Self(0x240ca1cc), + Self(0x2de92c6f), + Self(0x4a7484aa), + Self(0x5cb0a9dc), + Self(0x76f988da), + Self(0x983e5152), + Self(0xa831c66d), + Self(0xb00327c8), + Self(0xbf597fc7), + Self(0xc6e00bf3), + Self(0xd5a79147), + Self(0x06ca6351), + Self(0x14292967), + Self(0x27b70a85), + Self(0x2e1b2138), + Self(0x4d2c6dfc), + Self(0x53380d13), + Self(0x650a7354), + Self(0x766a0abb), + Self(0x81c2c92e), + Self(0x92722c85), + Self(0xa2bfe8a1), + Self(0xa81a664b), + Self(0xc24b8b70), + Self(0xc76c51a3), + Self(0xd192e819), + Self(0xd6990624), + Self(0xf40e3585), + Self(0x106aa070), + Self(0x19a4c116), + Self(0x1e376c08), + Self(0x2748774c), + Self(0x34b0bcb5), + Self(0x391c0cb3), + Self(0x4ed8aa4a), + Self(0x5b9cca4f), + Self(0x682e6ff3), + Self(0x748f82ee), + Self(0x78a5636f), + Self(0x84c87814), + Self(0x8cc70208), + Self(0x90befffa), + Self(0xa4506ceb), + Self(0xbef9a3f7), + Self(0xc67178f2), + ]; +} + +impl Word for Wrapping { + const ZERO: Self = Self(0); + type InputBytes = [u8; 8]; + + #[inline(always)] + fn from_be_bytes(input: Self::InputBytes) -> Self { + Self(u64::from_be_bytes(input)) + } + + #[inline(always)] + fn rotr(self, count: u32) -> Self { + Self(self.0.rotate_right(count)) + } +} + +// SHA-384 and SHA-512 +impl Sha2 for Wrapping { + // FIPS 180-4 4.1.3 + const BIG_SIGMA_0: (u32, u32, u32) = (28, 34, 39); + const BIG_SIGMA_1: (u32, u32, u32) = (14, 18, 41); + const SMALL_SIGMA_0: (u32, u32, usize) = (1, 8, 7); + const SMALL_SIGMA_1: (u32, u32, usize) = (19, 61, 6); + + // FIPS 180-4 {6.4.2} Step 1 + const ROUNDS: usize = 80; + + type W = [Self; Self::ROUNDS]; + fn zero_w() -> Self::W { + [Self::ZERO; Self::ROUNDS] + } + + // FIPS 180-4 4.2.3 + const K: &'static Self::W = &[ + Self(0x428a2f98d728ae22), + Self(0x7137449123ef65cd), + Self(0xb5c0fbcfec4d3b2f), + Self(0xe9b5dba58189dbbc), + Self(0x3956c25bf348b538), + Self(0x59f111f1b605d019), + Self(0x923f82a4af194f9b), + Self(0xab1c5ed5da6d8118), + Self(0xd807aa98a3030242), + Self(0x12835b0145706fbe), + Self(0x243185be4ee4b28c), + 
Self(0x550c7dc3d5ffb4e2), + Self(0x72be5d74f27b896f), + Self(0x80deb1fe3b1696b1), + Self(0x9bdc06a725c71235), + Self(0xc19bf174cf692694), + Self(0xe49b69c19ef14ad2), + Self(0xefbe4786384f25e3), + Self(0x0fc19dc68b8cd5b5), + Self(0x240ca1cc77ac9c65), + Self(0x2de92c6f592b0275), + Self(0x4a7484aa6ea6e483), + Self(0x5cb0a9dcbd41fbd4), + Self(0x76f988da831153b5), + Self(0x983e5152ee66dfab), + Self(0xa831c66d2db43210), + Self(0xb00327c898fb213f), + Self(0xbf597fc7beef0ee4), + Self(0xc6e00bf33da88fc2), + Self(0xd5a79147930aa725), + Self(0x06ca6351e003826f), + Self(0x142929670a0e6e70), + Self(0x27b70a8546d22ffc), + Self(0x2e1b21385c26c926), + Self(0x4d2c6dfc5ac42aed), + Self(0x53380d139d95b3df), + Self(0x650a73548baf63de), + Self(0x766a0abb3c77b2a8), + Self(0x81c2c92e47edaee6), + Self(0x92722c851482353b), + Self(0xa2bfe8a14cf10364), + Self(0xa81a664bbc423001), + Self(0xc24b8b70d0f89791), + Self(0xc76c51a30654be30), + Self(0xd192e819d6ef5218), + Self(0xd69906245565a910), + Self(0xf40e35855771202a), + Self(0x106aa07032bbd1b8), + Self(0x19a4c116b8d2d0c8), + Self(0x1e376c085141ab53), + Self(0x2748774cdf8eeb99), + Self(0x34b0bcb5e19b48a8), + Self(0x391c0cb3c5c95a63), + Self(0x4ed8aa4ae3418acb), + Self(0x5b9cca4f7763e373), + Self(0x682e6ff3d6b2b8a3), + Self(0x748f82ee5defb2fc), + Self(0x78a5636f43172f60), + Self(0x84c87814a1f0ab72), + Self(0x8cc702081a6439ec), + Self(0x90befffa23631e28), + Self(0xa4506cebde82bde9), + Self(0xbef9a3f7b2c67915), + Self(0xc67178f2e372532b), + Self(0xca273eceea26619c), + Self(0xd186b8c721c0c207), + Self(0xeada7dd6cde0eb1e), + Self(0xf57d4f7fee6ed178), + Self(0x06f067aa72176fba), + Self(0x0a637dc5a2c898a6), + Self(0x113f9804bef90dae), + Self(0x1b710b35131c471b), + Self(0x28db77f523047d84), + Self(0x32caab7b40c72493), + Self(0x3c9ebe0a15c9bebc), + Self(0x431d67c49c100d4c), + Self(0x4cc5d4becb3e42b6), + Self(0x597f299cfc657e2a), + Self(0x5fcb6fab3ad6faec), + Self(0x6c44198c4a475817), + ]; +} diff --git a/ring-0.17.14/src/digest/sha2/ffi.rs b/ring-0.17.14/src/digest/sha2/ffi.rs new file mode 100644 index 0000000000..3ee8044e3b --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/ffi.rs @@ -0,0 +1,71 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::CHAINING_WORDS; +use crate::polyfill::slice::AsChunks; +use core::num::{NonZeroUsize, Wrapping}; + +/// `unsafe { T => f }` means it is safe to call `f` iff we can construct +/// a value of type `T`. +macro_rules! sha2_ffi { + ( $U:ty, $BLOCK_LEN:expr, unsafe { $Cpu:ty => $f:ident }, + $state:expr, $data:expr, $cpu:expr $(,)? ) => {{ + prefixed_extern! 
{ + fn $f( + state: *mut [core::num::Wrapping<$U>; crate::digest::sha2::CHAINING_WORDS], + data: *const [u8; $BLOCK_LEN], + num: crate::c::NonZero_size_t); + } + // SAFETY: The user asserts that $f has the signature above and is safe + // to call if additionally we have a value of type `$Cpu`, which we do. + unsafe { + crate::digest::sha2::ffi::sha2_ffi::<$U, $Cpu, { $BLOCK_LEN }>($state, $data, $cpu, $f) + } + }}; +} + +macro_rules! sha2_32_ffi { + ( unsafe { $Cpu:ty => $f:ident }, $state:expr, $data:expr, $cpu:expr $(,)? ) => { + sha2_ffi!(u32, crate::digest::sha2::SHA256_BLOCK_LEN.into(), + unsafe { $Cpu => $f }, $state, $data, $cpu) + } +} + +macro_rules! sha2_64_ffi { + ( unsafe { $Cpu:ty => $f:ident }, $state:expr, $data:expr, $cpu:expr $(,)? ) => { + sha2_ffi!(u64, SHA512_BLOCK_LEN.into(), unsafe { $Cpu => $f }, $state, $data, $cpu) + } +} + +pub(super) unsafe fn sha2_ffi( + state: &mut [Wrapping; CHAINING_WORDS], + data: AsChunks, + cpu: Cpu, + f: unsafe extern "C" fn( + *mut [Wrapping; CHAINING_WORDS], + *const [u8; BLOCK_LEN], + crate::c::NonZero_size_t, + ), +) { + if let Some(blocks) = NonZeroUsize::new(data.len()) { + let data = data.as_ptr(); + let _: Cpu = cpu; + // SAFETY: + // * `blocks` is non-zero. + // * `data` is non-NULL and points to `blocks` blocks. + // * The caller asserted that `f` meets this contract if we have + // an instance of `Cpu`. + unsafe { f(state, data, blocks) } + } +} diff --git a/ring-0.17.14/src/digest/sha2/mod.rs b/ring-0.17.14/src/digest/sha2/mod.rs new file mode 100644 index 0000000000..78ca27b6da --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/mod.rs @@ -0,0 +1,34 @@ +// Copyright 2019-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::BlockLen; + +pub(super) use self::{ + sha2_32::{block_data_order_32, State32, SHA256_BLOCK_LEN}, + sha2_64::{block_data_order_64, State64, SHA512_BLOCK_LEN}, +}; + +pub(super) const CHAINING_WORDS: usize = 8; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86_64" +))] +#[macro_use] +mod ffi; + +pub(super) mod fallback; +mod sha2_32; +mod sha2_64; diff --git a/ring-0.17.14/src/digest/sha2/sha2_32.rs b/ring-0.17.14/src/digest/sha2/sha2_32.rs new file mode 100644 index 0000000000..4f762d1587 --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/sha2_32.rs @@ -0,0 +1,63 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{BlockLen, CHAINING_WORDS}; +use crate::{cpu, polyfill::slice::AsChunks}; +use cfg_if::cfg_if; +use core::num::Wrapping; + +pub(in super::super) const SHA256_BLOCK_LEN: BlockLen = BlockLen::_512; + +pub type State32 = [Wrapping; CHAINING_WORDS]; + +pub(crate) fn block_data_order_32( + state: &mut State32, + data: AsChunks, + cpu: cpu::Features, +) { + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Sha256}; + if let Some(cpu) = cpu.get_feature() { + sha2_32_ffi!(unsafe { Sha256 => sha256_block_data_order_hw }, state, data, cpu) + } else { + sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) + } + } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Neon}; + if let Some(cpu) = cpu.get_feature() { + sha2_32_ffi!(unsafe { Neon => sha256_block_data_order_neon }, state, data, cpu) + } else { + sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) + } + } else if #[cfg(target_arch = "x86_64")] { + use cpu::{GetFeature as _, intel::{Avx, IntelCpu, Sha, Ssse3 }}; + let cpu = cpu.values(); + if let Some(cpu) = cpu.get_feature() { + sha2_32_ffi!(unsafe { (Sha, Ssse3) => sha256_block_data_order_hw }, state, data, cpu) + } else if let Some(cpu) = cpu.get_feature() { + // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA + // extension; see the discussion in upstream's sha1-586.pl. + sha2_32_ffi!(unsafe { (Avx, IntelCpu) => sha256_block_data_order_avx }, state, data, cpu) + } else if let Some(cpu) = cpu.get_feature() { + sha2_32_ffi!(unsafe { Ssse3 => sha256_block_data_order_ssse3 }, state, data, cpu) + } else { + sha2_32_ffi!(unsafe { () => sha256_block_data_order_nohw }, state, data, ()) + } + } else { + let _ = cpu; // Unneeded. + *state = super::fallback::block_data_order(*state, data) + } + } +} diff --git a/ring-0.17.14/src/digest/sha2/sha2_64.rs b/ring-0.17.14/src/digest/sha2/sha2_64.rs new file mode 100644 index 0000000000..63c254e1d9 --- /dev/null +++ b/ring-0.17.14/src/digest/sha2/sha2_64.rs @@ -0,0 +1,60 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
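+
+// As in sha2_32.rs, dispatch below goes through the `sha2_64_ffi!` macro from
+// ffi.rs: it expands to a `prefixed_extern!` declaration of the named assembly
+// entry point plus a call to `sha2_ffi`, and the `unsafe { Cpu => f }` syntax
+// records which CPU-feature value must be in hand before `f` may be called.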
+ +use super::{BlockLen, CHAINING_WORDS}; +use crate::{cpu, polyfill::slice::AsChunks}; +use cfg_if::cfg_if; +use core::num::Wrapping; + +pub(in super::super) const SHA512_BLOCK_LEN: BlockLen = BlockLen::_1024; + +pub type State64 = [Wrapping; CHAINING_WORDS]; + +pub(crate) fn block_data_order_64( + state: &mut State64, + data: AsChunks, + cpu: cpu::Features, +) { + cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Sha512}; + if let Some(cpu) = cpu.get_feature() { + sha2_64_ffi!(unsafe { Sha512 => sha512_block_data_order_hw }, state, data, cpu) + } else { + sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) + } + } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { + use cpu::{GetFeature as _, arm::Neon}; + if let Some(cpu) = cpu.get_feature() { + sha2_64_ffi!(unsafe { Neon => sha512_block_data_order_neon }, state, data, cpu) + } else { + sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) + } + } else if #[cfg(target_arch = "x86_64")] { + use cpu::{GetFeature as _, intel::{Avx, IntelCpu}}; + if let Some(cpu) = cpu.get_feature() { + // Pre-Zen AMD CPUs had microcoded SHLD/SHRD which makes the + // AVX version slow. We're also unsure of the side channel + // ramifications of those microcoded instructions. + sha2_64_ffi!(unsafe { (Avx, IntelCpu) => sha512_block_data_order_avx }, + state, data, cpu); + } else { + sha2_64_ffi!(unsafe { () => sha512_block_data_order_nohw }, state, data, ()) + } + } else { + let _ = cpu; // Unneeded. + *state = super::fallback::block_data_order(*state, data) + } + } +} diff --git a/ring-0.17.14/src/ec.rs b/ring-0.17.14/src/ec.rs new file mode 100644 index 0000000000..33e7fc2feb --- /dev/null +++ b/ring-0.17.14/src/ec.rs @@ -0,0 +1,68 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{cpu, error, rand}; + +pub use self::keys::{KeyPair, PublicKey, Seed}; + +pub struct Curve { + pub public_key_len: usize, + pub elem_scalar_seed_len: usize, + + pub id: CurveID, + + // Precondition: `bytes` is the correct length. 
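+    // For every curve defined here that means `elem_scalar_seed_len` bytes; the
+    // caller is responsible for having checked the length already.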
+ check_private_key_bytes: fn(bytes: &[u8], cpu: cpu::Features) -> Result<(), error::Unspecified>, + + generate_private_key: fn( + rng: &dyn rand::SecureRandom, + &mut [u8], + cpu: cpu::Features, + ) -> Result<(), error::Unspecified>, + + public_from_private: fn( + public_out: &mut [u8], + private_key: &Seed, + cpu: cpu::Features, + ) -> Result<(), error::Unspecified>, +} + +derive_debug_via_id!(Curve); + +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum CurveID { + Curve25519, + P256, + P384, +} + +const ELEM_MAX_BITS: usize = 384; +pub const ELEM_MAX_BYTES: usize = (ELEM_MAX_BITS + 7) / 8; + +pub const SCALAR_MAX_BYTES: usize = ELEM_MAX_BYTES; +const SEED_MAX_BYTES: usize = ELEM_MAX_BYTES; + +/// The maximum length of a PKCS#8 documents generated by *ring* for ECC keys. +/// +/// This is NOT the maximum length of a PKCS#8 document that can be consumed by +/// `pkcs8::unwrap_key()`. +/// +/// `40` is the length of the P-384 template. It is actually one byte shorter +/// than the P-256 template, but the private key and the public key are much +/// longer. +pub const PKCS8_DOCUMENT_MAX_LEN: usize = 40 + SCALAR_MAX_BYTES + keys::PUBLIC_KEY_MAX_LEN; + +pub mod curve25519; +mod keys; +pub mod suite_b; diff --git a/ring-0.17.14/src/ec/curve25519.rs b/ring-0.17.14/src/ec/curve25519.rs new file mode 100644 index 0000000000..4f4596d083 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519.rs @@ -0,0 +1,22 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Elliptic curve operations and schemes using Curve25519. + +pub mod ed25519; + +pub mod x25519; + +mod ops; +mod scalar; diff --git a/ring-0.17.14/src/ec/curve25519/ed25519.rs b/ring-0.17.14/src/ec/curve25519/ed25519.rs new file mode 100644 index 0000000000..fe1a9ff607 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/ed25519.rs @@ -0,0 +1,32 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! EdDSA Signatures. + +use super::ops::ELEM_LEN; +use crate::digest; + +pub mod signing; +pub mod verification; + +/// The length of an Ed25519 public key. 
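+///
+/// This is the 32-byte encoding of the point RFC 8032 Section 5.1.5 calls *A*,
+/// i.e. the same length as an encoded curve element (`ELEM_LEN`).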
+pub const ED25519_PUBLIC_KEY_LEN: usize = ELEM_LEN; + +pub fn eddsa_digest(signature_r: &[u8], public_key: &[u8], msg: &[u8]) -> digest::Digest { + let mut ctx = digest::Context::new(&digest::SHA512); + ctx.update(signature_r); + ctx.update(public_key); + ctx.update(msg); + ctx.finish() +} diff --git a/ring-0.17.14/src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der b/ring-0.17.14/src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der new file mode 100644 index 0000000000..7230086e9c Binary files /dev/null and b/ring-0.17.14/src/ec/curve25519/ed25519/ed25519_pkcs8_v2_template.der differ diff --git a/ring-0.17.14/src/ec/curve25519/ed25519/signing.rs b/ring-0.17.14/src/ec/curve25519/ed25519/signing.rs new file mode 100644 index 0000000000..d6d249d782 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/ed25519/signing.rs @@ -0,0 +1,275 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! EdDSA Signatures. + +use super::{super::ops::*, eddsa_digest, ED25519_PUBLIC_KEY_LEN}; +use crate::{ + cpu, digest, error, + io::der, + pkcs8, rand, + signature::{self, KeyPair as SigningKeyPair}, +}; + +/// An Ed25519 key pair, for signing. +pub struct Ed25519KeyPair { + // RFC 8032 Section 5.1.6 calls this *s*. + private_scalar: Scalar, + + // RFC 8032 Section 5.1.6 calls this *prefix*. + private_prefix: Prefix, + + // RFC 8032 Section 5.1.5 calls this *A*. + public_key: PublicKey, +} + +derive_debug_via_field!(Ed25519KeyPair, stringify!(Ed25519KeyPair), public_key); + +impl Ed25519KeyPair { + /// Generates a new key pair and returns the key pair serialized as a + /// PKCS#8 document. + /// + /// The PKCS#8 document will be a v2 `OneAsymmetricKey` with the public key, + /// as described in [RFC 5958 Section 2]; see [RFC 8410 Section 10.3] for an + /// example. + /// + /// [RFC 5958 Section 2]: https://tools.ietf.org/html/rfc5958#section-2 + /// [RFC 8410 Section 10.3]: https://tools.ietf.org/html/rfc8410#section-10.3 + pub fn generate_pkcs8( + rng: &dyn rand::SecureRandom, + ) -> Result { + let cpu_features = cpu::features(); + let seed: [u8; SEED_LEN] = rand::generate(rng)?.expose(); + let key_pair = Self::from_seed_(&seed, cpu_features); + Ok(pkcs8::wrap_key( + &PKCS8_TEMPLATE, + &seed[..], + key_pair.public_key().as_ref(), + )) + } + + /// Constructs an Ed25519 key pair by parsing an unencrypted PKCS#8 v2 + /// Ed25519 private key. + /// + /// `openssl genpkey -algorithm ED25519` generates PKCS# v1 keys, which + /// require the use of `Ed25519KeyPair::from_pkcs8_maybe_unchecked()` + /// instead of `Ed25519KeyPair::from_pkcs8()`. + /// + /// The input must be in PKCS#8 v2 format, and in particular it must contain + /// the public key in addition to the private key. `from_pkcs8()` will + /// verify that the public key and the private key are consistent with each + /// other. 
+ /// + /// Some early implementations of PKCS#8 v2, including earlier versions of + /// *ring* and other implementations, wrapped the public key in the wrong + /// ASN.1 tags. Both that incorrect form and the standardized form are + /// accepted. + /// + /// If you need to parse PKCS#8 v1 files (without the public key) then use + /// `Ed25519KeyPair::from_pkcs8_maybe_unchecked()` instead. + pub fn from_pkcs8(pkcs8: &[u8]) -> Result { + let version = pkcs8::Version::V2Only(pkcs8::PublicKeyOptions { + accept_legacy_ed25519_public_key_tag: true, + }); + let (seed, public_key) = unwrap_pkcs8(version, untrusted::Input::from(pkcs8))?; + Self::from_seed_and_public_key( + seed.as_slice_less_safe(), + public_key.unwrap().as_slice_less_safe(), + ) + } + + /// Constructs an Ed25519 key pair by parsing an unencrypted PKCS#8 v1 or v2 + /// Ed25519 private key. + /// + /// `openssl genpkey -algorithm ED25519` generates PKCS# v1 keys. + /// + /// It is recommended to use `Ed25519KeyPair::from_pkcs8()`, which accepts + /// only PKCS#8 v2 files that contain the public key. + /// `from_pkcs8_maybe_unchecked()` parses PKCS#2 files exactly like + /// `from_pkcs8()`. It also accepts v1 files. PKCS#8 v1 files do not contain + /// the public key, so when a v1 file is parsed the public key will be + /// computed from the private key, and there will be no consistency check + /// between the public key and the private key. + /// + /// Some early implementations of PKCS#8 v2, including earlier versions of + /// *ring* and other implementations, wrapped the public key in the wrong + /// ASN.1 tags. Both that incorrect form and the standardized form are + /// accepted. + /// + /// PKCS#8 v2 files are parsed exactly like `Ed25519KeyPair::from_pkcs8()`. + pub fn from_pkcs8_maybe_unchecked(pkcs8: &[u8]) -> Result { + let version = pkcs8::Version::V1OrV2(pkcs8::PublicKeyOptions { + accept_legacy_ed25519_public_key_tag: true, + }); + let (seed, public_key) = unwrap_pkcs8(version, untrusted::Input::from(pkcs8))?; + if let Some(public_key) = public_key { + Self::from_seed_and_public_key( + seed.as_slice_less_safe(), + public_key.as_slice_less_safe(), + ) + } else { + Self::from_seed_unchecked(seed.as_slice_less_safe()) + } + } + + /// Constructs an Ed25519 key pair from the private key seed `seed` and its + /// public key `public_key`. + /// + /// It is recommended to use `Ed25519KeyPair::from_pkcs8()` instead. + /// + /// The private and public keys will be verified to be consistent with each + /// other. This helps avoid misuse of the key (e.g. accidentally swapping + /// the private key and public key, or using the wrong private key for the + /// public key). This also detects any corruption of the public or private + /// key. + pub fn from_seed_and_public_key( + seed: &[u8], + public_key: &[u8], + ) -> Result { + let pair = Self::from_seed_unchecked(seed)?; + + // This implicitly verifies that `public_key` is the right length. + // XXX: This rejects ~18 keys when they are partially reduced, though + // those keys are virtually impossible to find. + if public_key != pair.public_key.as_ref() { + let err = if public_key.len() != pair.public_key.as_ref().len() { + error::KeyRejected::invalid_encoding() + } else { + error::KeyRejected::inconsistent_components() + }; + return Err(err); + } + + Ok(pair) + } + + /// Constructs a Ed25519 key pair from the private key seed `seed`. + /// + /// It is recommended to use `Ed25519KeyPair::from_pkcs8()` instead. 
When + /// that is not practical, it is recommended to use + /// `Ed25519KeyPair::from_seed_and_public_key()` instead. + /// + /// Since the public key is not given, the public key will be computed from + /// the private key. It is not possible to detect misuse or corruption of + /// the private key since the public key isn't given as input. + pub fn from_seed_unchecked(seed: &[u8]) -> Result { + let seed = seed + .try_into() + .map_err(|_| error::KeyRejected::invalid_encoding())?; + Ok(Self::from_seed_(seed, cpu::features())) + } + + fn from_seed_(seed: &Seed, cpu_features: cpu::Features) -> Self { + let h = digest::digest(&digest::SHA512, seed); + let (private_scalar, private_prefix) = h.as_ref().split_at(SCALAR_LEN); + + let private_scalar = + MaskedScalar::from_bytes_masked(private_scalar.try_into().unwrap()).into(); + + let a = ExtPoint::from_scalarmult_base(&private_scalar, cpu_features); + + Self { + private_scalar, + private_prefix: private_prefix.try_into().unwrap(), + public_key: PublicKey(a.into_encoded_point(cpu_features)), + } + } + + /// Returns the signature of the message `msg`. + pub fn sign(&self, msg: &[u8]) -> signature::Signature { + let cpu_features = cpu::features(); + signature::Signature::new(|signature_bytes| { + prefixed_extern! { + fn x25519_sc_muladd( + s: &mut [u8; SCALAR_LEN], + a: &Scalar, + b: &Scalar, + c: &Scalar, + ); + } + + let (signature_bytes, _unused) = signature_bytes.split_at_mut(ELEM_LEN + SCALAR_LEN); + let (signature_r, signature_s) = signature_bytes.split_at_mut(ELEM_LEN); + let nonce = { + let mut ctx = digest::Context::new(&digest::SHA512); + ctx.update(&self.private_prefix); + ctx.update(msg); + ctx.finish() + }; + let nonce = Scalar::from_sha512_digest_reduced(nonce); + + let r = ExtPoint::from_scalarmult_base(&nonce, cpu_features); + signature_r.copy_from_slice(&r.into_encoded_point(cpu_features)); + let hram_digest = eddsa_digest(signature_r, self.public_key.as_ref(), msg); + let hram = Scalar::from_sha512_digest_reduced(hram_digest); + unsafe { + x25519_sc_muladd( + signature_s.try_into().unwrap(), + &hram, + &self.private_scalar, + &nonce, + ); + } + + SIGNATURE_LEN + }) + } +} + +impl signature::KeyPair for Ed25519KeyPair { + type PublicKey = PublicKey; + + fn public_key(&self) -> &Self::PublicKey { + &self.public_key + } +} + +#[derive(Clone, Copy)] +pub struct PublicKey([u8; ED25519_PUBLIC_KEY_LEN]); + +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +derive_debug_self_as_ref_hex_bytes!(PublicKey); + +fn unwrap_pkcs8( + version: pkcs8::Version, + input: untrusted::Input, +) -> Result<(untrusted::Input, Option), error::KeyRejected> { + let (private_key, public_key) = pkcs8::unwrap_key(&PKCS8_TEMPLATE, version, input)?; + let private_key = private_key + .read_all(error::Unspecified, |input| { + der::expect_tag_and_get_value(input, der::Tag::OctetString) + }) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + Ok((private_key, public_key)) +} + +type Prefix = [u8; PREFIX_LEN]; +const PREFIX_LEN: usize = digest::SHA512_OUTPUT_LEN - SCALAR_LEN; + +const SIGNATURE_LEN: usize = ELEM_LEN + SCALAR_LEN; + +type Seed = [u8; SEED_LEN]; +const SEED_LEN: usize = 32; + +static PKCS8_TEMPLATE: pkcs8::Template = pkcs8::Template { + bytes: include_bytes!("ed25519_pkcs8_v2_template.der"), + alg_id_range: core::ops::Range { start: 7, end: 12 }, + curve_id_index: 0, + private_key_index: 0x10, +}; diff --git a/ring-0.17.14/src/ec/curve25519/ed25519/verification.rs 
b/ring-0.17.14/src/ec/curve25519/ed25519/verification.rs new file mode 100644 index 0000000000..8a81f42f51 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/ed25519/verification.rs @@ -0,0 +1,85 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! EdDSA Signatures. + +use super::{super::ops::*, eddsa_digest}; +use crate::{cpu, error, sealed, signature}; + +/// Parameters for EdDSA signing and verification. +pub struct EdDSAParameters; + +impl core::fmt::Debug for EdDSAParameters { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + write!(f, "ring::signature::ED25519") + } +} + +/// Verification of [Ed25519] signatures. +/// +/// Ed25519 uses SHA-512 as the digest algorithm. +/// +/// [Ed25519]: https://ed25519.cr.yp.to/ +pub static ED25519: EdDSAParameters = EdDSAParameters {}; + +impl signature::VerificationAlgorithm for EdDSAParameters { + fn verify( + &self, + public_key: untrusted::Input, + msg: untrusted::Input, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified> { + let cpu_features = cpu::features(); + + let public_key: &[u8; ELEM_LEN] = public_key.as_slice_less_safe().try_into()?; + let (signature_r, signature_s) = signature.read_all(error::Unspecified, |input| { + let signature_r: &[u8; ELEM_LEN] = input + .read_bytes(ELEM_LEN)? + .as_slice_less_safe() + .try_into()?; + let signature_s: &[u8; SCALAR_LEN] = input + .read_bytes(SCALAR_LEN)? + .as_slice_less_safe() + .try_into()?; + Ok((signature_r, signature_s)) + })?; + + let signature_s = Scalar::from_bytes_checked(*signature_s)?; + + let mut a = ExtPoint::from_encoded_point_vartime(public_key)?; + a.invert_vartime(); + + let h_digest = eddsa_digest(signature_r, public_key, msg.as_slice_less_safe()); + let h = Scalar::from_sha512_digest_reduced(h_digest); + + let mut r = Point::new_at_infinity(); + unsafe { x25519_ge_double_scalarmult_vartime(&mut r, &h, &a, &signature_s) }; + let r_check = r.into_encoded_point(cpu_features); + if *signature_r != r_check { + return Err(error::Unspecified); + } + Ok(()) + } +} + +impl sealed::Sealed for EdDSAParameters {} + +prefixed_extern! { + fn x25519_ge_double_scalarmult_vartime( + r: &mut Point, + a_coeff: &Scalar, + a: &ExtPoint, + b_coeff: &Scalar, + ); +} diff --git a/ring-0.17.14/src/ec/curve25519/ops.rs b/ring-0.17.14/src/ec/curve25519/ops.rs new file mode 100644 index 0000000000..b34d68c8f2 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/ops.rs @@ -0,0 +1,180 @@ +// Copyright 2015-2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Elliptic curve operations on the birationally equivalent curves Curve25519 +//! and Edwards25519. + +pub use super::scalar::{MaskedScalar, Scalar, SCALAR_LEN}; +use crate::{ + bssl, cpu, error, + limb::{Limb, LIMB_BITS}, +}; +use core::{ffi::c_int, marker::PhantomData}; + +// Elem` is `fe` in curve25519/internal.h. +// Elem is `fe_loose` in curve25519/internal.h. +// Keep this in sync with curve25519/internal.h. +#[repr(C)] +pub struct Elem { + limbs: [Limb; ELEM_LIMBS], // This is called `v` in the C code. + encoding: PhantomData, +} + +pub trait Encoding {} +pub struct T; +impl Encoding for T {} + +const ELEM_LIMBS: usize = 5 * 64 / LIMB_BITS; + +impl Elem { + fn zero() -> Self { + Self { + limbs: Default::default(), + encoding: PhantomData, + } + } +} + +impl Elem { + fn negate(&mut self) { + unsafe { + x25519_fe_neg(self); + } + } +} + +// An encoding of a curve point. If on Curve25519, it should be encoded as +// described in Section 5 of [RFC 7748]. If on Edwards25519, it should be +// encoded as described in section 5.1.2 of [RFC 8032]. +// +// [RFC 7748] https://tools.ietf.org/html/rfc7748#section-5 +// [RFC 8032] https://tools.ietf.org/html/rfc8032#section-5.1.2 +pub type EncodedPoint = [u8; ELEM_LEN]; +pub const ELEM_LEN: usize = 32; + +// Keep this in sync with `ge_p3` in curve25519/internal.h. +#[repr(C)] +pub struct ExtPoint { + x: Elem, + y: Elem, + z: Elem, + t: Elem, +} + +impl ExtPoint { + // Returns the result of multiplying the base point by the scalar in constant time. + pub(super) fn from_scalarmult_base(scalar: &Scalar, cpu: cpu::Features) -> Self { + let mut r = Self { + x: Elem::zero(), + y: Elem::zero(), + z: Elem::zero(), + t: Elem::zero(), + }; + prefixed_extern! { + fn x25519_ge_scalarmult_base(h: &mut ExtPoint, a: &Scalar, has_fe25519_adx: c_int); + } + unsafe { + x25519_ge_scalarmult_base(&mut r, scalar, has_fe25519_adx(cpu).into()); + } + r + } + + pub fn from_encoded_point_vartime(encoded: &EncodedPoint) -> Result { + let mut point = Self { + x: Elem::zero(), + y: Elem::zero(), + z: Elem::zero(), + t: Elem::zero(), + }; + + Result::from(unsafe { x25519_ge_frombytes_vartime(&mut point, encoded) }).map(|()| point) + } + + pub(super) fn into_encoded_point(self, cpu_features: cpu::Features) -> EncodedPoint { + encode_point(self.x, self.y, self.z, cpu_features) + } + + pub(super) fn invert_vartime(&mut self) { + self.x.negate(); + self.t.negate(); + } +} + +// Keep this in sync with `ge_p2` in curve25519/internal.h. 
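+// `ge_p2` is the plain projective representation (X : Y : Z), in which the affine
+// point is (X/Z, Y/Z); unlike `ge_p3`/`ExtPoint` above it does not carry the
+// extended coordinate `t`.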
+#[repr(C)] +pub struct Point { + x: Elem, + y: Elem, + z: Elem, +} + +impl Point { + pub fn new_at_infinity() -> Self { + Self { + x: Elem::zero(), + y: Elem::zero(), + z: Elem::zero(), + } + } + + pub(super) fn into_encoded_point(self, cpu_features: cpu::Features) -> EncodedPoint { + encode_point(self.x, self.y, self.z, cpu_features) + } +} + +fn encode_point(x: Elem, y: Elem, z: Elem, _cpu_features: cpu::Features) -> EncodedPoint { + let mut bytes = [0; ELEM_LEN]; + + let sign_bit: u8 = unsafe { + let mut recip = Elem::zero(); + x25519_fe_invert(&mut recip, &z); + + let mut x_over_z = Elem::zero(); + x25519_fe_mul_ttt(&mut x_over_z, &x, &recip); + + let mut y_over_z = Elem::zero(); + x25519_fe_mul_ttt(&mut y_over_z, &y, &recip); + x25519_fe_tobytes(&mut bytes, &y_over_z); + + x25519_fe_isnegative(&x_over_z) + }; + + // The preceding computations must execute in constant time, but this + // doesn't need to. + bytes[ELEM_LEN - 1] ^= sign_bit << 7; + + bytes +} + +#[inline(always)] +pub(super) fn has_fe25519_adx(cpu: cpu::Features) -> bool { + cfg_if::cfg_if! { + if #[cfg(all(target_arch = "x86_64", not(target_os = "windows")))] { + use cpu::{intel::{Adx, Bmi1, Bmi2}, GetFeature as _}; + matches!(cpu.get_feature(), Some((Adx { .. }, Bmi1 { .. }, Bmi2 { .. }))) + } else { + let _ = cpu; + false + } + } +} + +prefixed_extern! { + fn x25519_fe_invert(out: &mut Elem, z: &Elem); + fn x25519_fe_isnegative(elem: &Elem) -> u8; + fn x25519_fe_mul_ttt(h: &mut Elem, f: &Elem, g: &Elem); + fn x25519_fe_neg(f: &mut Elem); + fn x25519_fe_tobytes(bytes: &mut EncodedPoint, elem: &Elem); + fn x25519_ge_frombytes_vartime(h: &mut ExtPoint, s: &EncodedPoint) -> bssl::Result; +} diff --git a/ring-0.17.14/src/ec/curve25519/scalar.rs b/ring-0.17.14/src/ec/curve25519/scalar.rs new file mode 100644 index 0000000000..a56e281d52 --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/scalar.rs @@ -0,0 +1,78 @@ +// Copyright 2015-2019 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + arithmetic::limbs_from_hex, + digest, error, limb, + polyfill::slice::{self, AsChunks}, +}; +use core::array; + +#[repr(transparent)] +pub struct Scalar([u8; SCALAR_LEN]); + +pub const SCALAR_LEN: usize = 32; + +impl Scalar { + // Constructs a `Scalar` from `bytes`, failing if `bytes` encodes a scalar + // that is not in the range [0, n). 
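+    // Here n is the Ed25519 group order,
+    // n = 2^252 + 27742317777372353535851937790883648493 (the `ORDER` constant below).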
+ pub fn from_bytes_checked(bytes: [u8; SCALAR_LEN]) -> Result { + const ORDER: [limb::Limb; SCALAR_LEN / limb::LIMB_BYTES] = + limbs_from_hex("1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed"); + let order = ORDER.map(limb::Limb::from); + + let (limbs_as_bytes, _empty): (AsChunks, _) = + slice::as_chunks(&bytes); + debug_assert!(_empty.is_empty()); + let limbs: [limb::Limb; SCALAR_LEN / limb::LIMB_BYTES] = + array::from_fn(|i| limb::Limb::from_le_bytes(limbs_as_bytes[i])); + limb::verify_limbs_less_than_limbs_leak_bit(&limbs, &order)?; + + Ok(Self(bytes)) + } + + // Constructs a `Scalar` from `digest` reduced modulo n. + pub fn from_sha512_digest_reduced(digest: digest::Digest) -> Self { + prefixed_extern! { + fn x25519_sc_reduce(s: &mut UnreducedScalar); + } + let mut unreduced = [0u8; digest::SHA512_OUTPUT_LEN]; + unreduced.copy_from_slice(digest.as_ref()); + unsafe { x25519_sc_reduce(&mut unreduced) }; + Self((&unreduced[..SCALAR_LEN]).try_into().unwrap()) + } +} + +#[repr(transparent)] +pub struct MaskedScalar([u8; SCALAR_LEN]); + +impl MaskedScalar { + pub fn from_bytes_masked(bytes: [u8; SCALAR_LEN]) -> Self { + prefixed_extern! { + fn x25519_sc_mask(a: &mut [u8; SCALAR_LEN]); + } + let mut r = Self(bytes); + unsafe { x25519_sc_mask(&mut r.0) }; + r + } +} + +impl From for Scalar { + fn from(MaskedScalar(scalar): MaskedScalar) -> Self { + Self(scalar) + } +} + +type UnreducedScalar = [u8; UNREDUCED_SCALAR_LEN]; +const UNREDUCED_SCALAR_LEN: usize = SCALAR_LEN * 2; diff --git a/ring-0.17.14/src/ec/curve25519/x25519.rs b/ring-0.17.14/src/ec/curve25519/x25519.rs new file mode 100644 index 0000000000..216648a5bd --- /dev/null +++ b/ring-0.17.14/src/ec/curve25519/x25519.rs @@ -0,0 +1,249 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! X25519 Key agreement. + +use super::{ops, scalar::SCALAR_LEN}; +use crate::{agreement, bb, cpu, ec, error, rand}; +use core::ffi::c_int; + +static CURVE25519: ec::Curve = ec::Curve { + public_key_len: PUBLIC_KEY_LEN, + elem_scalar_seed_len: ELEM_AND_SCALAR_LEN, + id: ec::CurveID::Curve25519, + check_private_key_bytes: x25519_check_private_key_bytes, + generate_private_key: x25519_generate_private_key, + public_from_private: x25519_public_from_private, +}; + +/// X25519 (ECDH using Curve25519) as described in [RFC 7748]. +/// +/// Everything is as described in RFC 7748. Key agreement will fail if the +/// result of the X25519 operation is zero; see the notes on the +/// "all-zero value" in [RFC 7748 section 6.1]. 
+/// +/// [RFC 7748]: https://tools.ietf.org/html/rfc7748 +/// [RFC 7748 section 6.1]: https://tools.ietf.org/html/rfc7748#section-6.1 +pub static X25519: agreement::Algorithm = agreement::Algorithm { + curve: &CURVE25519, + ecdh: x25519_ecdh, +}; + +#[allow(clippy::unnecessary_wraps)] +fn x25519_check_private_key_bytes( + bytes: &[u8], + _: cpu::Features, +) -> Result<(), error::Unspecified> { + debug_assert_eq!(bytes.len(), PRIVATE_KEY_LEN); + Ok(()) +} + +fn x25519_generate_private_key( + rng: &dyn rand::SecureRandom, + out: &mut [u8], + _: cpu::Features, +) -> Result<(), error::Unspecified> { + rng.fill(out) +} + +fn x25519_public_from_private( + public_out: &mut [u8], + private_key: &ec::Seed, + cpu_features: cpu::Features, +) -> Result<(), error::Unspecified> { + let public_out = public_out.try_into()?; + + let private_key: &[u8; SCALAR_LEN] = private_key.bytes_less_safe().try_into()?; + let private_key = ops::MaskedScalar::from_bytes_masked(*private_key); + + #[cfg(all( + all(target_arch = "arm", target_endian = "little"), + any(target_os = "android", target_os = "linux") + ))] + if let Some(cpu) = >::get_feature(&cpu_features) { + static MONTGOMERY_BASE_POINT: [u8; 32] = [ + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, + ]; + x25519_neon(public_out, &private_key, &MONTGOMERY_BASE_POINT, cpu); + return Ok(()); + } + + prefixed_extern! { + fn x25519_public_from_private_generic_masked( + public_key_out: &mut PublicKey, + private_key: &PrivateKey, + use_adx: c_int, + ); + } + unsafe { + x25519_public_from_private_generic_masked( + public_out, + &private_key, + ops::has_fe25519_adx(cpu_features).into(), + ); + } + + Ok(()) +} + +fn x25519_ecdh( + out: &mut [u8], + my_private_key: &ec::Seed, + peer_public_key: untrusted::Input, + cpu_features: cpu::Features, +) -> Result<(), error::Unspecified> { + let my_private_key: &[u8; SCALAR_LEN] = my_private_key.bytes_less_safe().try_into()?; + let my_private_key = ops::MaskedScalar::from_bytes_masked(*my_private_key); + let peer_public_key: &[u8; PUBLIC_KEY_LEN] = peer_public_key.as_slice_less_safe().try_into()?; + + fn scalar_mult( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + #[allow(unused_variables)] cpu_features: cpu::Features, + ) { + #[cfg(all( + all(target_arch = "arm", target_endian = "little"), + any(target_os = "android", target_os = "linux") + ))] + if let Some(cpu) = >::get_feature(&cpu_features) { + return x25519_neon(out, scalar, point, cpu); + } + + #[cfg(all(target_arch = "x86_64", not(target_os = "windows")))] + { + if ops::has_fe25519_adx(cpu_features) { + prefixed_extern! { + fn x25519_scalar_mult_adx( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + ); + } + return unsafe { x25519_scalar_mult_adx(out, scalar, point) }; + } + } + + prefixed_extern! { + fn x25519_scalar_mult_generic_masked( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + ); + } + unsafe { + x25519_scalar_mult_generic_masked(out, scalar, point); + } + } + + scalar_mult( + out.try_into()?, + &my_private_key, + peer_public_key, + cpu_features, + ); + + let zeros: SharedSecret = [0; SHARED_SECRET_LEN]; + if bb::verify_slices_are_equal(out, &zeros).is_ok() { + // All-zero output results when the input is a point of small order. + return Err(error::Unspecified); + } + + Ok(()) +} + +// BoringSSL uses `!defined(OPENSSL_APPLE)`. 
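+// Here the NEON path is instead gated to little-endian ARM on Android and Linux,
+// as expressed by the `cfg` attribute below.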
+#[cfg(all( + all(target_arch = "arm", target_endian = "little"), + any(target_os = "android", target_os = "linux") +))] +fn x25519_neon( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + _cpu: cpu::arm::Neon, +) { + prefixed_extern! { + fn x25519_NEON( + out: &mut ops::EncodedPoint, + scalar: &ops::MaskedScalar, + point: &ops::EncodedPoint, + ); + } + unsafe { x25519_NEON(out, scalar, point) } +} + +const ELEM_AND_SCALAR_LEN: usize = ops::ELEM_LEN; + +type PrivateKey = ops::MaskedScalar; +const PRIVATE_KEY_LEN: usize = ELEM_AND_SCALAR_LEN; + +// An X25519 public key as an encoded Curve25519 point. +type PublicKey = [u8; PUBLIC_KEY_LEN]; +const PUBLIC_KEY_LEN: usize = ELEM_AND_SCALAR_LEN; + +// An X25519 shared secret as an encoded Curve25519 point. +type SharedSecret = [u8; SHARED_SECRET_LEN]; +const SHARED_SECRET_LEN: usize = ELEM_AND_SCALAR_LEN; + +#[cfg(test)] +mod tests { + use super::*; + use crate::ec; + use untrusted::Input; + + #[test] + fn test_x25519_public_from_private() { + struct TestVector { + private: [u8; 32], + public: [u8; 32], + } + static TEST_CASES: &[TestVector] = &[ + TestVector { + private: [ + 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, + 0xb2, 0x66, 0x45, 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, 0xb1, 0x77, + 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a, + ], + public: [ + 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, 0x74, 0x8b, 0x7d, 0xdc, 0xb4, + 0x3e, 0xf7, 0x5a, 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, 0xeb, 0xa4, + 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a, + ], + }, + TestVector { + private: [ + 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, 0x79, 0xe1, 0x7f, 0x8b, 0x83, + 0x80, 0x0e, 0xe6, 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, 0x1c, 0x2f, + 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb, + ], + public: [ + 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, 0xd3, 0x5b, 0x61, 0xc2, 0xec, + 0xe4, 0x35, 0x37, 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, 0xad, 0xfc, + 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f, + ], + }, + ]; + let cpu_features = cpu::features(); + for test_case in TEST_CASES { + let seed = + ec::Seed::from_bytes(&CURVE25519, Input::from(&test_case.private), cpu_features) + .unwrap(); + let mut output = [0u8; 32]; + x25519_public_from_private(&mut output, &seed, cpu_features).unwrap(); + assert_eq!(output, test_case.public); + } + } +} diff --git a/ring-0.17.14/src/ec/keys.rs b/ring-0.17.14/src/ec/keys.rs new file mode 100644 index 0000000000..6cf925035b --- /dev/null +++ b/ring-0.17.14/src/ec/keys.rs @@ -0,0 +1,97 @@ +use super::{Curve, ELEM_MAX_BYTES, SEED_MAX_BYTES}; +use crate::{cpu, error, rand}; + +pub struct KeyPair { + seed: Seed, + public_key: PublicKey, +} + +impl KeyPair { + pub(super) fn derive( + seed: Seed, + cpu_features: cpu::Features, + ) -> Result { + let public_key = seed.compute_public_key(cpu_features)?; + Ok(Self { seed, public_key }) + } + + pub fn public_key(&self) -> &PublicKey { + &self.public_key + } + pub fn split(self) -> (Seed, PublicKey) { + (self.seed, self.public_key) + } +} + +pub struct Seed { + bytes: [u8; SEED_MAX_BYTES], + curve: &'static Curve, +} + +impl Seed { + pub(crate) fn generate( + curve: &'static Curve, + rng: &dyn rand::SecureRandom, + cpu: cpu::Features, + ) -> Result { + let mut r = Self { + bytes: [0u8; SEED_MAX_BYTES], + curve, + }; + (curve.generate_private_key)(rng, &mut r.bytes[..curve.elem_scalar_seed_len], cpu)?; + Ok(r) + } + + pub(crate) fn from_bytes( + curve: &'static Curve, + bytes: untrusted::Input, + cpu: 
cpu::Features, + ) -> Result { + let bytes = bytes.as_slice_less_safe(); + if curve.elem_scalar_seed_len != bytes.len() { + return Err(error::Unspecified); + } + (curve.check_private_key_bytes)(bytes, cpu)?; + let mut r = Self { + bytes: [0; SEED_MAX_BYTES], + curve, + }; + r.bytes[..curve.elem_scalar_seed_len].copy_from_slice(bytes); + Ok(r) + } + + pub fn bytes_less_safe(&self) -> &[u8] { + &self.bytes[..self.curve.elem_scalar_seed_len] + } + + pub(crate) fn compute_public_key( + &self, + cpu_features: cpu::Features, + ) -> Result { + let mut public_key = PublicKey { + bytes: [0u8; PUBLIC_KEY_MAX_LEN], + len: self.curve.public_key_len, + }; + (self.curve.public_from_private)( + &mut public_key.bytes[..public_key.len], + self, + cpu_features, + )?; + Ok(public_key) + } +} + +#[derive(Copy, Clone)] +pub struct PublicKey { + bytes: [u8; PUBLIC_KEY_MAX_LEN], + len: usize, +} + +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + &self.bytes[..self.len] + } +} + +/// The maximum length, in bytes, of an encoded public key. +pub const PUBLIC_KEY_MAX_LEN: usize = 1 + (2 * ELEM_MAX_BYTES); diff --git a/ring-0.17.14/src/ec/suite_b.rs b/ring-0.17.14/src/ec/suite_b.rs new file mode 100644 index 0000000000..41adcfdf0b --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b.rs @@ -0,0 +1,239 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Elliptic curve operations on P-256 & P-384. + +use self::ops::*; +use crate::{arithmetic::montgomery::*, cpu, ec, error, io::der, pkcs8}; + +// NIST SP 800-56A Step 3: "If q is an odd prime p, verify that +// yQ**2 = xQ**3 + axQ + b in GF(p), where the arithmetic is performed modulo +// p." +// +// That is, verify that (x, y) is on the curve, which is true iif: +// +// y**2 == x**3 + a*x + b (mod q) +// +// Or, equivalently, but more efficiently: +// +// y**2 == (x**2 + a)*x + b (mod q) +// +fn verify_affine_point_is_on_the_curve( + q: &Modulus, + (x, y): (&Elem, &Elem), +) -> Result<(), error::Unspecified> { + verify_affine_point_is_on_the_curve_scaled(q, (x, y), &Elem::from(q.a()), &Elem::from(q.b())) +} + +// Use `verify_affine_point_is_on_the_curve` instead of this function whenever +// the affine coordinates are available or will become available. This function +// should only be used then the affine coordinates are never calculated. See +// the notes for `verify_affine_point_is_on_the_curve_scaled`. +// +// The value `z**2` is returned on success because it is useful for ECDSA +// verification. +// +// This function also verifies that the point is not at infinity. +fn verify_jacobian_point_is_on_the_curve( + q: &Modulus, + p: &Point, +) -> Result, error::Unspecified> { + let z = q.point_z(p); + + // Verify that the point is not at infinity. 
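+    // (In Jacobian coordinates the point at infinity is exactly the class of
+    // representatives with z == 0, so a non-zero z rules it out.)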
+ q.elem_verify_is_not_zero(&z)?; + + let x = q.point_x(p); + let y = q.point_y(p); + + // We are given Jacobian coordinates (x, y, z). So, we have: + // + // (x/z**2, y/z**3) == (x', y'), + // + // where (x', y') are the affine coordinates. The curve equation is: + // + // y'**2 == x'**3 + a*x' + b == (x'**2 + a)*x' + b + // + // Substituting our Jacobian coordinates, we get: + // + // / y \**2 / / x \**2 \ / x \ + // | ---- | == | | ---- | + a | * | ---- | + b + // \ z**3 / \ \ z**2 / / \ z**2 / + // + // Simplify: + // + // y**2 / x**2 \ x + // ---- == | ---- + a | * ---- + b + // z**6 \ z**4 / z**2 + // + // Multiply both sides by z**6: + // + // z**6 / x**2 \ z**6 + // ---- * y**2 == | ---- + a | * ---- * x + (z**6) * b + // z**6 \ z**4 / z**2 + // + // Simplify: + // + // / x**2 \ + // y**2 == | ---- + a | * z**4 * x + (z**6) * b + // \ z**4 / + // + // Distribute z**4: + // + // / z**4 \ + // y**2 == | ---- * x**2 + z**4 * a | * x + (z**6) * b + // \ z**4 / + // + // Simplify: + // + // y**2 == (x**2 + z**4 * a) * x + (z**6) * b + // + let z2 = q.elem_squared(&z); + let z4 = q.elem_squared(&z2); + let z4_a = q.elem_product(&z4, &Elem::from(q.a())); + let z6 = q.elem_product(&z4, &z2); + let z6_b = q.elem_product(&z6, &Elem::from(q.b())); + verify_affine_point_is_on_the_curve_scaled(q, (&x, &y), &z4_a, &z6_b)?; + Ok(z2) +} + +// Handles the common logic of point-is-on-the-curve checks for both affine and +// Jacobian cases. +// +// When doing the check that the point is on the curve after a computation, +// to avoid fault attacks or mitigate potential bugs, it is better for security +// to use `verify_affine_point_is_on_the_curve` on the affine coordinates, +// because it provides some protection against faults that occur in the +// computation of the inverse of `z`. See the paper and presentation "Fault +// Attacks on Projective-to-Affine Coordinates Conversion" by Diana Maimuţ, +// Cédric Murdica, David Naccache, Mehdi Tibouchi. That presentation concluded +// simply "Check the validity of the result after conversion to affine +// coordinates." (It seems like a good idea to verify that +// z_inv * z == 1 mod q too). +// +// In the case of affine coordinates (x, y), `a_scaled` and `b_scaled` are +// `a` and `b`, respectively. In the case of Jacobian coordinates (x, y, z), +// the computation and comparison is the same, except `a_scaled` and `b_scaled` +// are (z**4 * a) and (z**6 * b), respectively. Thus, performance is another +// reason to prefer doing the check on the affine coordinates, as Jacobian +// computation requires 3 extra multiplications and 2 extra squarings. +// +// An example of a fault attack that isn't mitigated by a point-on-the-curve +// check after multiplication is given in "Sign Change Fault Attacks On +// Elliptic Curve Cryptosystems" by Johannes Blömer, Martin Otto, and +// Jean-Pierre Seifert. 
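+//
+// As a toy illustration of the scaled check (tiny made-up parameters, not one of
+// the supported curves): over GF(17) with a = 2 and b = 2, the affine point (5, 1)
+// satisfies y**2 == (x**2 + a)*x + b, since (25 + 2)*5 + 2 = 137 == 1 (mod 17).
+// Writing the same point in Jacobian coordinates with z = 3 gives x = 11, y = 10,
+// and scaled coefficients a_scaled = a*z**4 == 9 and b_scaled = b*z**6 == 13
+// (mod 17); the check then computes (121 + 9)*11 + 13 == 15 == y**2 (mod 17).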
+fn verify_affine_point_is_on_the_curve_scaled( + q: &Modulus, + (x, y): (&Elem, &Elem), + a_scaled: &Elem, + b_scaled: &Elem, +) -> Result<(), error::Unspecified> { + let lhs = q.elem_squared(y); + + let mut rhs = q.elem_squared(x); + q.add_assign(&mut rhs, a_scaled); + q.elem_mul(&mut rhs, x); + q.add_assign(&mut rhs, b_scaled); + + if !q.elems_are_equal(&lhs, &rhs).leak() { + return Err(error::Unspecified); + } + + Ok(()) +} + +pub(crate) fn key_pair_from_pkcs8( + curve: &'static ec::Curve, + template: &pkcs8::Template, + input: untrusted::Input, + cpu_features: cpu::Features, +) -> Result { + let (ec_private_key, _) = pkcs8::unwrap_key(template, pkcs8::Version::V1Only, input)?; + let (private_key, public_key) = + ec_private_key.read_all(error::KeyRejected::invalid_encoding(), |input| { + // https://tools.ietf.org/html/rfc5915#section-3 + der::nested( + input, + der::Tag::Sequence, + error::KeyRejected::invalid_encoding(), + |input| key_pair_from_pkcs8_(template, input), + ) + })?; + key_pair_from_bytes(curve, private_key, public_key, cpu_features) +} + +fn key_pair_from_pkcs8_<'a>( + template: &pkcs8::Template, + input: &mut untrusted::Reader<'a>, +) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::KeyRejected> { + let version = der::small_nonnegative_integer(input) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + if version != 1 { + return Err(error::KeyRejected::version_not_supported()); + } + + let private_key = der::expect_tag_and_get_value(input, der::Tag::OctetString) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + + // [0] parameters (optional). + if input.peek(u8::from(der::Tag::ContextSpecificConstructed0)) { + let actual_alg_id = + der::expect_tag_and_get_value(input, der::Tag::ContextSpecificConstructed0) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + if actual_alg_id.as_slice_less_safe() != template.curve_oid().as_slice_less_safe() { + return Err(error::KeyRejected::wrong_algorithm()); + } + } + + // [1] publicKey. The RFC says it is optional, but we require it + // to be present. + let public_key = der::nested( + input, + der::Tag::ContextSpecificConstructed1, + error::Unspecified, + der::bit_string_with_no_unused_bits, + ) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + + Ok((private_key, public_key)) +} + +pub(crate) fn key_pair_from_bytes( + curve: &'static ec::Curve, + private_key_bytes: untrusted::Input, + public_key_bytes: untrusted::Input, + cpu_features: cpu::Features, +) -> Result { + let seed = ec::Seed::from_bytes(curve, private_key_bytes, cpu_features) + .map_err(|error::Unspecified| error::KeyRejected::invalid_component())?; + + let r = ec::KeyPair::derive(seed, cpu_features) + .map_err(|error::Unspecified| error::KeyRejected::unexpected_error())?; + if public_key_bytes.as_slice_less_safe() != r.public_key().as_ref() { + return Err(error::KeyRejected::inconsistent_components()); + } + + Ok(r) +} + +pub mod curve; + +pub mod ecdh; + +pub mod ecdsa; + +mod ops; + +mod private_key; +mod public_key; diff --git a/ring-0.17.14/src/ec/suite_b/curve.rs b/ring-0.17.14/src/ec/suite_b/curve.rs new file mode 100644 index 0000000000..0e9f1a9e62 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/curve.rs @@ -0,0 +1,93 @@ +// Copyright 2015-2017 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{cpu, ec, error, rand}; + +/// A key agreement algorithm. +macro_rules! suite_b_curve { + ( $NAME:ident, $bits:expr, $private_key_ops:expr, $id:expr, + $check_private_key_bytes:ident, $generate_private_key:ident, + $public_from_private:ident) => { + /// Public keys are encoding in uncompressed form using the + /// Octet-String-to-Elliptic-Curve-Point algorithm in + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. Public keys are + /// validated during key agreement according to + /// [NIST Special Publication 800-56A, revision 2] and Appendix B.3 of + /// the NSA's [Suite B Implementer's Guide to NIST SP 800-56A]. + /// + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: + /// http://www.secg.org/sec1-v2.pdf + /// [NIST Special Publication 800-56A, revision 2]: + /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf + /// [Suite B Implementer's Guide to NIST SP 800-56A]: + /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf + pub static $NAME: ec::Curve = ec::Curve { + public_key_len: 1 + (2 * (($bits + 7) / 8)), + elem_scalar_seed_len: ($bits + 7) / 8, + id: $id, + check_private_key_bytes: $check_private_key_bytes, + generate_private_key: $generate_private_key, + public_from_private: $public_from_private, + }; + + fn $check_private_key_bytes( + bytes: &[u8], + cpu: cpu::Features, + ) -> Result<(), error::Unspecified> { + debug_assert_eq!(bytes.len(), $bits / 8); + ec::suite_b::private_key::check_scalar_big_endian_bytes($private_key_ops, bytes, cpu) + } + + fn $generate_private_key( + rng: &dyn rand::SecureRandom, + out: &mut [u8], + cpu: cpu::Features, + ) -> Result<(), error::Unspecified> { + ec::suite_b::private_key::generate_private_scalar_bytes($private_key_ops, rng, out, cpu) + } + + fn $public_from_private( + public_out: &mut [u8], + private_key: &ec::Seed, + cpu: cpu::Features, + ) -> Result<(), error::Unspecified> { + ec::suite_b::private_key::public_from_private( + $private_key_ops, + public_out, + private_key, + cpu, + ) + } + }; +} + +suite_b_curve!( + P256, + 256, + &ec::suite_b::ops::p256::PRIVATE_KEY_OPS, + ec::CurveID::P256, + p256_check_private_key_bytes, + p256_generate_private_key, + p256_public_from_private +); + +suite_b_curve!( + P384, + 384, + &ec::suite_b::ops::p384::PRIVATE_KEY_OPS, + ec::CurveID::P384, + p384_check_private_key_bytes, + p384_generate_private_key, + p384_public_from_private +); diff --git a/ring-0.17.14/src/ec/suite_b/ecdh.rs b/ring-0.17.14/src/ec/suite_b/ecdh.rs new file mode 100644 index 0000000000..59b4374b8f --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdh.rs @@ -0,0 +1,243 @@ +// Copyright 2015-2017 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! ECDH key agreement using the P-256 and P-384 curves. + +use super::{ops::*, private_key::*, public_key::*}; +use crate::{agreement, cpu, ec, error}; + +/// A key agreement algorithm. +macro_rules! ecdh { + ( $NAME:ident, $curve:expr, $name_str:expr, $private_key_ops:expr, + $public_key_ops:expr, $ecdh:ident ) => { + #[doc = "ECDH using the NSA Suite B"] + #[doc=$name_str] + #[doc = "curve."] + /// + /// Public keys are encoding in uncompressed form using the + /// Octet-String-to-Elliptic-Curve-Point algorithm in + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. Public keys are + /// validated during key agreement according to + /// [NIST Special Publication 800-56A, revision 2] and Appendix B.3 of + /// the NSA's [Suite B Implementer's Guide to NIST SP 800-56A]. + /// + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: + /// http://www.secg.org/sec1-v2.pdf + /// [NIST Special Publication 800-56A, revision 2]: + /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf + /// [Suite B Implementer's Guide to NIST SP 800-56A]: + /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf + pub static $NAME: agreement::Algorithm = agreement::Algorithm { + curve: $curve, + ecdh: $ecdh, + }; + + fn $ecdh( + out: &mut [u8], + my_private_key: &ec::Seed, + peer_public_key: untrusted::Input, + cpu: cpu::Features, + ) -> Result<(), error::Unspecified> { + ecdh( + $private_key_ops, + $public_key_ops, + out, + my_private_key, + peer_public_key, + cpu, + ) + } + }; +} + +ecdh!( + ECDH_P256, + &ec::suite_b::curve::P256, + "P-256 (secp256r1)", + &p256::PRIVATE_KEY_OPS, + &p256::PUBLIC_KEY_OPS, + p256_ecdh +); + +ecdh!( + ECDH_P384, + &ec::suite_b::curve::P384, + "P-384 (secp384r1)", + &p384::PRIVATE_KEY_OPS, + &p384::PUBLIC_KEY_OPS, + p384_ecdh +); + +fn ecdh( + private_key_ops: &PrivateKeyOps, + public_key_ops: &PublicKeyOps, + out: &mut [u8], + my_private_key: &ec::Seed, + peer_public_key: untrusted::Input, + cpu: cpu::Features, +) -> Result<(), error::Unspecified> { + // The NIST SP 800-56Ar2 steps are from section 5.7.1.2 Elliptic Curve + // Cryptography Cofactor Diffie-Hellman (ECC CDH) Primitive. + // + // The "NSA Guide" steps are from section 3.1 of the NSA guide, "Ephemeral + // Unified Model." + + let q = &public_key_ops.common.elem_modulus(cpu); + + // NSA Guide Step 1 is handled separately. + + // NIST SP 800-56Ar2 5.6.2.2.2. + // NSA Guide Step 2. + // + // `parse_uncompressed_point` verifies that the point is not at infinity + // and that it is on the curve, using the Partial Public-Key Validation + // Routine. + let peer_public_key = parse_uncompressed_point(public_key_ops, q, peer_public_key)?; + + // NIST SP 800-56Ar2 Step 1. + // NSA Guide Step 3 (except point at infinity check). 
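+ //
+ // (The ECC CDH primitive computes P = h * d * Q_peer, where d is our
+ // private scalar, and outputs the affine x-coordinate of P as the shared
+ // secret Z.)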
+ // + // Note that the cofactor (h) is one since we only support prime-order + // curves, so we can safely ignore the cofactor. + // + // It is impossible for the result to be the point at infinity because our + // private key is in the range [1, n) and the curve has prime order and + // `parse_uncompressed_point` verified that the peer public key is on the + // curve and not at infinity. However, since the standards require the + // check, we do it using `assert!`. + // + // NIST SP 800-56Ar2 defines "Destroy" thusly: "In this Recommendation, to + // destroy is an action applied to a key or a piece of secret data. After + // a key or a piece of secret data is destroyed, no information about its + // value can be recovered." We interpret "destroy" somewhat liberally: we + // assume that since we throw away the values to be destroyed, no + // information about their values can be recovered. This doesn't meet the + // NSA guide's explicit requirement to "zeroize" them though. + // TODO: this only needs common scalar ops + let n = &private_key_ops.common.scalar_modulus(cpu); + let my_private_key = private_key_as_scalar(n, my_private_key); + let product = private_key_ops.point_mul(&my_private_key, &peer_public_key, cpu); + + // NIST SP 800-56Ar2 Steps 2, 3, 4, and 5. + // NSA Guide Steps 3 (point at infinity check) and 4. + // + // Again, we have a pretty liberal interpretation of the NIST's spec's + // "Destroy" that doesn't meet the NSA requirement to "zeroize." + // `big_endian_affine_from_jacobian` verifies that the result is not at + // infinity and also does an extra check to verify that the point is on + // the curve. + big_endian_affine_from_jacobian(private_key_ops, q, out, None, &product) + + // NSA Guide Step 5 & 6 are deferred to the caller. Again, we have a + // pretty liberal interpretation of the NIST's spec's "Destroy" that + // doesn't meet the NSA requirement to "zeroize." +} + +#[cfg(test)] +mod tests { + use super::super::ops; + use crate::testutil as test; + use crate::{agreement, ec, limb}; + + static SUPPORTED_SUITE_B_ALGS: [(&str, &agreement::Algorithm, &ec::Curve, &ops::CommonOps); 2] = [ + ( + "P-256", + &agreement::ECDH_P256, + &super::super::curve::P256, + &ops::p256::COMMON_OPS, + ), + ( + "P-384", + &agreement::ECDH_P384, + &super::super::curve::P384, + &ops::p384::COMMON_OPS, + ), + ]; + + #[test] + fn test_agreement_suite_b_ecdh_generate() { + // Generates a string of bytes 0x00...00, which will always result in + // a scalar value of zero. + let random_00 = test::rand::FixedByteRandom { byte: 0x00 }; + + // Generates a string of bytes 0xFF...FF, which will be larger than the + // group order of any curve that is supported. + let random_ff = test::rand::FixedByteRandom { byte: 0xff }; + + for &(_, alg, curve, ops) in SUPPORTED_SUITE_B_ALGS.iter() { + // Test that the private key value zero is rejected and that + // `generate` gives up after a while of only getting zeros. + assert!(agreement::EphemeralPrivateKey::generate(alg, &random_00).is_err()); + + // Test that the private key value larger than the group order is + // rejected and that `generate` gives up after a while of only + // getting values larger than the group order. + assert!(agreement::EphemeralPrivateKey::generate(alg, &random_ff).is_err()); + + // Test that a private key value exactly equal to the group order + // is rejected and that `generate` gives up after a while of only + // getting that value from the PRNG. 
+ let mut n_bytes = [0u8; ec::SCALAR_MAX_BYTES]; + let num_bytes = curve.elem_scalar_seed_len; + limb::big_endian_from_limbs(ops.n_limbs(), &mut n_bytes[..num_bytes]); + { + let n_bytes = &mut n_bytes[..num_bytes]; + let rng = test::rand::FixedSliceRandom { bytes: n_bytes }; + assert!(agreement::EphemeralPrivateKey::generate(alg, &rng).is_err()); + } + + // Test that a private key value exactly equal to the group order + // minus 1 is accepted. + let mut n_minus_1_bytes = n_bytes; + { + let n_minus_1_bytes = &mut n_minus_1_bytes[..num_bytes]; + n_minus_1_bytes[num_bytes - 1] -= 1; + let rng = test::rand::FixedSliceRandom { + bytes: n_minus_1_bytes, + }; + let key = agreement::EphemeralPrivateKey::generate(alg, &rng).unwrap(); + assert_eq!(n_minus_1_bytes, key.bytes_for_test()); + } + + // Test that n + 1 also fails. + let mut n_plus_1_bytes = n_bytes; + { + let n_plus_1_bytes = &mut n_plus_1_bytes[..num_bytes]; + n_plus_1_bytes[num_bytes - 1] += 1; + let rng = test::rand::FixedSliceRandom { + bytes: n_plus_1_bytes, + }; + assert!(agreement::EphemeralPrivateKey::generate(alg, &rng).is_err()); + } + + // Test recovery from initial RNG failure. The first value will be + // n, then n + 1, then zero, the next value will be n - 1, which + // will be accepted. + { + let bytes = [ + &n_bytes[..num_bytes], + &n_plus_1_bytes[..num_bytes], + &[0u8; ec::SCALAR_MAX_BYTES][..num_bytes], + &n_minus_1_bytes[..num_bytes], + ]; + let rng = test::rand::FixedSliceSequenceRandom { + bytes: &bytes, + current: core::cell::UnsafeCell::new(0), + }; + let key = agreement::EphemeralPrivateKey::generate(alg, &rng).unwrap(); + assert_eq!(&n_minus_1_bytes[..num_bytes], key.bytes_for_test()); + } + } + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa.rs b/ring-0.17.14/src/ec/suite_b/ecdsa.rs new file mode 100644 index 0000000000..274c991626 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdsa.rs @@ -0,0 +1,3 @@ +mod digest_scalar; +pub mod signing; +pub mod verification; diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/digest_scalar.rs b/ring-0.17.14/src/ec/suite_b/ecdsa/digest_scalar.rs new file mode 100644 index 0000000000..6479c9490c --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdsa/digest_scalar.rs @@ -0,0 +1,119 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! ECDSA Signatures using the P-256 and P-384 curves. + +use crate::{digest, ec::suite_b::ops::*}; + +/// Calculate the digest of `msg` using the digest algorithm `digest_alg`. Then +/// convert the digest to a scalar in the range [0, n) as described in +/// NIST's FIPS 186-4 Section 4.2. Note that this is one of the few cases where +/// a `Scalar` is allowed to have the value zero. 
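+///
+/// (For example, with P-256 and SHA-384 the 48-byte digest is truncated to
+/// its leftmost 32 bytes, per the FIPS rule quoted below, and the result is
+/// then reduced once mod `n` if necessary.)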
+/// +/// NIST's FIPS 186-4 4.2 says "When the length of the output of the hash +/// function is greater than N (i.e., the bit length of q), then the leftmost N +/// bits of the hash function output block shall be used in any calculation +/// using the hash function output during the generation or verification of a +/// digital signature." +/// +/// "Leftmost N bits" means "N most significant bits" because we interpret the +/// digest as a bit-endian encoded integer. +/// +/// The NSA guide instead vaguely suggests that we should convert the digest +/// value to an integer and then reduce it mod `n`. However, real-world +/// implementations (e.g. `digest_to_bn` in OpenSSL and `hashToInt` in Go) do +/// what FIPS 186-4 says to do, not what the NSA guide suggests. +/// +/// Why shifting the value right by at most one bit is sufficient: P-256's `n` +/// has its 256th bit set; i.e. 2**255 < n < 2**256. Once we've truncated the +/// digest to 256 bits and converted it to an integer, it will have a value +/// less than 2**256. If the value is larger than `n` then shifting it one bit +/// right will give a value less than 2**255, which is less than `n`. The +/// analogous argument applies for P-384. However, it does *not* apply in +/// general; for example, it doesn't apply to P-521. +pub(super) fn digest_scalar(n: &Modulus, msg: digest::Digest) -> Scalar { + digest_scalar_(n, msg.as_ref()) +} + +#[cfg(test)] +pub(super) fn digest_bytes_scalar(n: &Modulus, digest: &[u8]) -> Scalar { + digest_scalar_(n, digest) +} + +// This is a separate function solely so that we can test specific digest +// values like all-zero values and values larger than `n`. +fn digest_scalar_(n: &Modulus, digest: &[u8]) -> Scalar { + let len = n.bytes_len(); + let digest = if digest.len() > len { + &digest[..len] + } else { + digest + }; + + scalar_parse_big_endian_partially_reduced_variable_consttime(n, untrusted::Input::from(digest)) + .unwrap() +} + +#[cfg(test)] +mod tests { + use super::digest_bytes_scalar; + use crate::testutil as test; + use crate::{cpu, digest, ec::suite_b::ops::*, limb}; + + #[test] + fn test() { + let cpu = cpu::features(); + test::run( + test_vector_file!("ecdsa_digest_scalar_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + let input = test_case.consume_bytes("Input"); + let output = test_case.consume_bytes("Output"); + + let (ops, digest_alg) = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => (&p256::PUBLIC_SCALAR_OPS, &digest::SHA256), + ("P-256", "SHA384") => (&p256::PUBLIC_SCALAR_OPS, &digest::SHA384), + ("P-384", "SHA256") => (&p384::PUBLIC_SCALAR_OPS, &digest::SHA256), + ("P-384", "SHA384") => (&p384::PUBLIC_SCALAR_OPS, &digest::SHA384), + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + let n = &ops.scalar_ops.scalar_modulus(cpu); + + assert_eq!(input.len(), digest_alg.output_len()); + assert_eq!(output.len(), ops.scalar_ops.scalar_bytes_len()); + assert_eq!(output.len(), n.bytes_len()); + + let expected = scalar_parse_big_endian_variable( + n, + limb::AllowZero::Yes, + untrusted::Input::from(&output), + ) + .unwrap(); + + let actual = digest_bytes_scalar(n, &input); + assert_eq!( + ops.scalar_ops.leak_limbs(&actual), + ops.scalar_ops.leak_limbs(&expected) + ); + + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der 
b/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der new file mode 100644 index 0000000000..d579082387 Binary files /dev/null and b/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p256_pkcs8_v1_template.der differ diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der b/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der new file mode 100644 index 0000000000..76cc36d403 Binary files /dev/null and b/ring-0.17.14/src/ec/suite_b/ecdsa/ecPublicKey_p384_pkcs8_v1_template.der differ diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/signing.rs b/ring-0.17.14/src/ec/suite_b/ecdsa/signing.rs new file mode 100644 index 0000000000..c2bc733c8a --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdsa/signing.rs @@ -0,0 +1,615 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! ECDSA Signatures using the P-256 and P-384 curves. + +use super::digest_scalar::digest_scalar; +use crate::{ + arithmetic::montgomery::*, + cpu, digest, + ec::{ + self, + suite_b::{ops::*, private_key}, + }, + error, + io::der, + limb, pkcs8, rand, sealed, signature, +}; +/// An ECDSA signing algorithm. +pub struct EcdsaSigningAlgorithm { + curve: &'static ec::Curve, + private_scalar_ops: &'static PrivateScalarOps, + private_key_ops: &'static PrivateKeyOps, + digest_alg: &'static digest::Algorithm, + pkcs8_template: &'static pkcs8::Template, + format_rs: fn(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize, + id: AlgorithmID, +} + +#[derive(Debug, Eq, PartialEq)] +enum AlgorithmID { + ECDSA_P256_SHA256_FIXED_SIGNING, + ECDSA_P384_SHA384_FIXED_SIGNING, + ECDSA_P256_SHA256_ASN1_SIGNING, + ECDSA_P384_SHA384_ASN1_SIGNING, +} + +derive_debug_via_id!(EcdsaSigningAlgorithm); + +impl PartialEq for EcdsaSigningAlgorithm { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for EcdsaSigningAlgorithm {} + +impl sealed::Sealed for EcdsaSigningAlgorithm {} + +/// An ECDSA key pair, used for signing. +pub struct EcdsaKeyPair { + d: Scalar, + nonce_key: NonceRandomKey, + alg: &'static EcdsaSigningAlgorithm, + public_key: PublicKey, +} + +derive_debug_via_field!(EcdsaKeyPair, stringify!(EcdsaKeyPair), public_key); + +impl EcdsaKeyPair { + /// Generates a new key pair and returns the key pair serialized as a + /// PKCS#8 document. + /// + /// The PKCS#8 document will be a v1 `OneAsymmetricKey` with the public key + /// included in the `ECPrivateKey` structure, as described in + /// [RFC 5958 Section 2] and [RFC 5915]. The `ECPrivateKey` structure will + /// not have a `parameters` field so the generated key is compatible with + /// PKCS#11. 
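+ ///
+ /// A typical round trip, as an illustrative sketch only (error handling
+ /// elided):
+ ///
+ /// ```ignore
+ /// let rng = rand::SystemRandom::new();
+ /// let pkcs8 =
+ ///     EcdsaKeyPair::generate_pkcs8(&ECDSA_P256_SHA256_ASN1_SIGNING, &rng)?;
+ /// let key_pair =
+ ///     EcdsaKeyPair::from_pkcs8(&ECDSA_P256_SHA256_ASN1_SIGNING, pkcs8.as_ref(), &rng)?;
+ /// let sig = key_pair.sign(&rng, b"example message")?;
+ /// ```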
+ /// + /// [RFC 5915]: https://tools.ietf.org/html/rfc5915 + /// [RFC 5958 Section 2]: https://tools.ietf.org/html/rfc5958#section-2 + pub fn generate_pkcs8( + alg: &'static EcdsaSigningAlgorithm, + rng: &dyn rand::SecureRandom, + ) -> Result { + let cpu = cpu::features(); + let private_key = ec::Seed::generate(alg.curve, rng, cpu)?; + let public_key = private_key.compute_public_key(cpu)?; + Ok(pkcs8::wrap_key( + alg.pkcs8_template, + private_key.bytes_less_safe(), + public_key.as_ref(), + )) + } + + /// Constructs an ECDSA key pair by parsing an unencrypted PKCS#8 v1 + /// id-ecPublicKey `ECPrivateKey` key. + /// + /// The input must be in PKCS#8 v1 format. It must contain the public key in + /// the `ECPrivateKey` structure; `from_pkcs8()` will verify that the public + /// key and the private key are consistent with each other. The algorithm + /// identifier must identify the curve by name; it must not use an + /// "explicit" encoding of the curve. The `parameters` field of the + /// `ECPrivateKey`, if present, must be the same named curve that is in the + /// algorithm identifier in the PKCS#8 header. + pub fn from_pkcs8( + alg: &'static EcdsaSigningAlgorithm, + pkcs8: &[u8], + rng: &dyn rand::SecureRandom, + ) -> Result { + let key_pair = ec::suite_b::key_pair_from_pkcs8( + alg.curve, + alg.pkcs8_template, + untrusted::Input::from(pkcs8), + cpu::features(), + )?; + Self::new(alg, key_pair, rng) + } + + /// Constructs an ECDSA key pair from the private key and public key bytes + /// + /// The private key must encoded as a big-endian fixed-length integer. For + /// example, a P-256 private key must be 32 bytes prefixed with leading + /// zeros as needed. + /// + /// The public key is encoding in uncompressed form using the + /// Octet-String-to-Elliptic-Curve-Point algorithm in + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]. + /// + /// This is intended for use by code that deserializes key pairs. It is + /// recommended to use `EcdsaKeyPair::from_pkcs8()` (with a PKCS#8-encoded + /// key) instead. + /// + /// [SEC 1: Elliptic Curve Cryptography, Version 2.0]: + /// http://www.secg.org/sec1-v2.pdf + pub fn from_private_key_and_public_key( + alg: &'static EcdsaSigningAlgorithm, + private_key: &[u8], + public_key: &[u8], + rng: &dyn rand::SecureRandom, + ) -> Result { + let key_pair = ec::suite_b::key_pair_from_bytes( + alg.curve, + untrusted::Input::from(private_key), + untrusted::Input::from(public_key), + cpu::features(), + )?; + Self::new(alg, key_pair, rng) + } + + fn new( + alg: &'static EcdsaSigningAlgorithm, + key_pair: ec::KeyPair, + rng: &dyn rand::SecureRandom, + ) -> Result { + let cpu = cpu::features(); + + let (seed, public_key) = key_pair.split(); + let n = &alg.private_scalar_ops.scalar_ops.scalar_modulus(cpu); + let d = private_key::private_key_as_scalar(n, &seed); + let d = alg.private_scalar_ops.to_mont(&d, cpu); + + let nonce_key = NonceRandomKey::new(alg, &seed, rng)?; + Ok(Self { + d, + nonce_key, + alg, + public_key: PublicKey(public_key), + }) + } + + /// Returns the signature of the `message` using a random nonce generated by `rng`. + pub fn sign( + &self, + rng: &dyn rand::SecureRandom, + message: &[u8], + ) -> Result { + let cpu = cpu::features(); + + // Step 4 (out of order). + let h = digest::digest(self.alg.digest_alg, message); + + // Incorporate `h` into the nonce to hedge against faulty RNGs. (This + // is not an approved random number generator that is mandated in + // the spec.) 
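+ //
+ // Concretely, `NonceRandom` below derives each nonce candidate as
+ // Digest(nonce_key || rng_output || Digest(message)), so even a weak or
+ // repeating RNG output is mixed with per-key and per-message data.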
+ let nonce_rng = NonceRandom { + key: &self.nonce_key, + message_digest: &h, + rng, + }; + + self.sign_digest(h, &nonce_rng, cpu) + } + + #[cfg(test)] + fn sign_with_fixed_nonce_during_test( + &self, + rng: &dyn rand::SecureRandom, + message: &[u8], + ) -> Result { + // Step 4 (out of order). + let h = digest::digest(self.alg.digest_alg, message); + + self.sign_digest(h, rng, cpu::features()) + } + + /// Returns the signature of message digest `h` using a "random" nonce + /// generated by `rng`. + fn sign_digest( + &self, + h: digest::Digest, + rng: &dyn rand::SecureRandom, + cpu: cpu::Features, + ) -> Result { + // NSA Suite B Implementer's Guide to ECDSA Section 3.4.1: ECDSA + // Signature Generation. + + // NSA Guide Prerequisites: + // + // Prior to generating an ECDSA signature, the signatory shall + // obtain: + // + // 1. an authentic copy of the domain parameters, + // 2. a digital signature key pair (d,Q), either generated by a + // method from Appendix A.1, or obtained from a trusted third + // party, + // 3. assurance of the validity of the public key Q (see Appendix + // A.3), and + // 4. assurance that he/she/it actually possesses the associated + // private key d (see [SP800-89] Section 6). + // + // The domain parameters are hard-coded into the source code. + // `EcdsaKeyPair::generate_pkcs8()` can be used to meet the second + // requirement; otherwise, it is up to the user to ensure the key pair + // was obtained from a trusted private key. The constructors for + // `EcdsaKeyPair` ensure that #3 and #4 are met subject to the caveats + // in SP800-89 Section 6. + + let ops = self.alg.private_scalar_ops; + let scalar_ops = ops.scalar_ops; + let cops = scalar_ops.common; + let private_key_ops = self.alg.private_key_ops; + let q = &cops.elem_modulus(cpu); + let n = &scalar_ops.scalar_modulus(cpu); + + for _ in 0..100 { + // XXX: iteration conut? + // Step 1. + let k = private_key::random_scalar(self.alg.private_key_ops, n, rng)?; + let k_inv = ops.scalar_inv_to_mont(&k, cpu); + + // Step 2. + let r = private_key_ops.point_mul_base(&k, cpu); + + // Step 3. + let r = { + let (x, _) = private_key::affine_from_jacobian(private_key_ops, q, &r)?; + let x = q.elem_unencoded(&x); + n.elem_reduced_to_scalar(&x) + }; + if n.is_zero(&r) { + continue; + } + + // Step 4 is done by the caller. + + // Step 5. + let e = digest_scalar(n, h); + + // Step 6. + let s = { + let mut e_plus_dr = scalar_ops.scalar_product(&self.d, &r, cpu); + n.add_assign(&mut e_plus_dr, &e); + scalar_ops.scalar_product(&k_inv, &e_plus_dr, cpu) + }; + if n.is_zero(&s) { + continue; + } + + // Step 7 with encoding. + return Ok(signature::Signature::new(|sig_bytes| { + (self.alg.format_rs)(scalar_ops, &r, &s, sig_bytes) + })); + } + + Err(error::Unspecified) + } +} + +/// Generates an ECDSA nonce in a way that attempts to protect against a faulty +/// `SecureRandom`. +struct NonceRandom<'a> { + key: &'a NonceRandomKey, + message_digest: &'a digest::Digest, + rng: &'a dyn rand::SecureRandom, +} + +impl core::fmt::Debug for NonceRandom<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("NonceRandom").finish() + } +} + +impl rand::sealed::SecureRandom for NonceRandom<'_> { + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + // Use the same digest algorithm that will be used to digest the + // message. The digest algorithm's output is exactly the right size; + // this is checked below. 
+ // + // XXX(perf): The single iteration will require two digest block + // operations because the amount of data digested is larger than one + // block. + let digest_alg = self.key.0.algorithm(); + let mut ctx = digest::Context::new(digest_alg); + + // Digest the randomized digest of the private key. + let key = self.key.0.as_ref(); + ctx.update(key); + + // The random value is digested between the key and the message so that + // the key and the message are not directly digested in the same digest + // block. + assert!(key.len() <= digest_alg.block_len() / 2); + { + let mut rand = [0u8; digest::MAX_BLOCK_LEN]; + let rand = &mut rand[..digest_alg.block_len() - key.len()]; + assert!(rand.len() >= dest.len()); + self.rng.fill(rand)?; + ctx.update(rand); + } + + ctx.update(self.message_digest.as_ref()); + + let nonce = ctx.finish(); + + // `copy_from_slice()` panics if the lengths differ, so we don't have + // to separately assert that the lengths are the same. + dest.copy_from_slice(nonce.as_ref()); + + Ok(()) + } +} + +impl sealed::Sealed for NonceRandom<'_> {} + +struct NonceRandomKey(digest::Digest); + +impl NonceRandomKey { + fn new( + alg: &EcdsaSigningAlgorithm, + seed: &ec::Seed, + rng: &dyn rand::SecureRandom, + ) -> Result { + let mut rand = [0; digest::MAX_OUTPUT_LEN]; + let rand = &mut rand[0..alg.curve.elem_scalar_seed_len]; + + // XXX: `KeyRejected` isn't the right way to model failure of the RNG, + // but to fix that we'd need to break the API by changing the result type. + // TODO: Fix the API in the next breaking release. + rng.fill(rand) + .map_err(|error::Unspecified| error::KeyRejected::rng_failed())?; + + let mut ctx = digest::Context::new(alg.digest_alg); + ctx.update(rand); + ctx.update(seed.bytes_less_safe()); + Ok(Self(ctx.finish())) + } +} + +impl signature::KeyPair for EcdsaKeyPair { + type PublicKey = PublicKey; + + fn public_key(&self) -> &Self::PublicKey { + &self.public_key + } +} + +#[derive(Clone, Copy)] +pub struct PublicKey(ec::PublicKey); + +derive_debug_self_as_ref_hex_bytes!(PublicKey); + +impl AsRef<[u8]> for PublicKey { + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +fn format_rs_fixed(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize { + let scalar_len = ops.scalar_bytes_len(); + + let (r_out, rest) = out.split_at_mut(scalar_len); + limb::big_endian_from_limbs(ops.leak_limbs(r), r_out); + + let (s_out, _) = rest.split_at_mut(scalar_len); + limb::big_endian_from_limbs(ops.leak_limbs(s), s_out); + + 2 * scalar_len +} + +fn format_rs_asn1(ops: &'static ScalarOps, r: &Scalar, s: &Scalar, out: &mut [u8]) -> usize { + // This assumes `a` is not zero since neither `r` or `s` is allowed to be + // zero. + fn format_integer_tlv(ops: &ScalarOps, a: &Scalar, out: &mut [u8]) -> usize { + let mut fixed = [0u8; ec::SCALAR_MAX_BYTES + 1]; + let fixed = &mut fixed[..(ops.scalar_bytes_len() + 1)]; + limb::big_endian_from_limbs(ops.leak_limbs(a), &mut fixed[1..]); + + // Since `a_fixed_out` is an extra byte long, it is guaranteed to start + // with a zero. + debug_assert_eq!(fixed[0], 0); + + // There must be at least one non-zero byte since `a` isn't zero. + let first_index = fixed.iter().position(|b| *b != 0).unwrap(); + + // If the first byte has its high bit set, it needs to be prefixed with 0x00. + let first_index = if fixed[first_index] & 0x80 != 0 { + first_index - 1 + } else { + first_index + }; + let value = &fixed[first_index..]; + + out[0] = der::Tag::Integer.into(); + + // Lengths less than 128 are encoded in one byte. 
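+ // For the supported curves this always holds: a scalar is at most 48
+ // bytes (P-384), or 49 with a leading 0x00, so the length of `value`
+ // always fits in a single short-form DER length byte.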
+ assert!(value.len() < 128); + #[allow(clippy::cast_possible_truncation)] + { + out[1] = value.len() as u8; + } + + out[2..][..value.len()].copy_from_slice(value); + + 2 + value.len() + } + + out[0] = der::Tag::Sequence.into(); + let r_tlv_len = format_integer_tlv(ops, r, &mut out[2..]); + let s_tlv_len = format_integer_tlv(ops, s, &mut out[2..][r_tlv_len..]); + + // Lengths less than 128 are encoded in one byte. + let value_len = r_tlv_len + s_tlv_len; + assert!(value_len < 128); + #[allow(clippy::cast_possible_truncation)] + { + out[1] = value_len as u8; + } + + 2 + value_len +} + +/// Signing of fixed-length (PKCS#11 style) ECDSA signatures using the +/// P-256 curve and SHA-256. +/// +/// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA256_FIXED_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { + curve: &ec::suite_b::curve::P256, + private_scalar_ops: &p256::PRIVATE_SCALAR_OPS, + private_key_ops: &p256::PRIVATE_KEY_OPS, + digest_alg: &digest::SHA256, + pkcs8_template: &EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE, + format_rs: format_rs_fixed, + id: AlgorithmID::ECDSA_P256_SHA256_FIXED_SIGNING, +}; + +/// Signing of fixed-length (PKCS#11 style) ECDSA signatures using the +/// P-384 curve and SHA-384. +/// +/// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P384_SHA384_FIXED_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { + curve: &ec::suite_b::curve::P384, + private_scalar_ops: &p384::PRIVATE_SCALAR_OPS, + private_key_ops: &p384::PRIVATE_KEY_OPS, + digest_alg: &digest::SHA384, + pkcs8_template: &EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE, + format_rs: format_rs_fixed, + id: AlgorithmID::ECDSA_P384_SHA384_FIXED_SIGNING, +}; + +/// Signing of ASN.1 DER-encoded ECDSA signatures using the P-256 curve and +/// SHA-256. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA256_ASN1_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { + curve: &ec::suite_b::curve::P256, + private_scalar_ops: &p256::PRIVATE_SCALAR_OPS, + private_key_ops: &p256::PRIVATE_KEY_OPS, + digest_alg: &digest::SHA256, + pkcs8_template: &EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE, + format_rs: format_rs_asn1, + id: AlgorithmID::ECDSA_P256_SHA256_ASN1_SIGNING, +}; + +/// Signing of ASN.1 DER-encoded ECDSA signatures using the P-384 curve and +/// SHA-384. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. 
+pub static ECDSA_P384_SHA384_ASN1_SIGNING: EcdsaSigningAlgorithm = EcdsaSigningAlgorithm { + curve: &ec::suite_b::curve::P384, + private_scalar_ops: &p384::PRIVATE_SCALAR_OPS, + private_key_ops: &p384::PRIVATE_KEY_OPS, + digest_alg: &digest::SHA384, + pkcs8_template: &EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE, + format_rs: format_rs_asn1, + id: AlgorithmID::ECDSA_P384_SHA384_ASN1_SIGNING, +}; + +static EC_PUBLIC_KEY_P256_PKCS8_V1_TEMPLATE: pkcs8::Template = pkcs8::Template { + bytes: include_bytes!("ecPublicKey_p256_pkcs8_v1_template.der"), + alg_id_range: core::ops::Range { start: 8, end: 27 }, + curve_id_index: 9, + private_key_index: 0x24, +}; + +static EC_PUBLIC_KEY_P384_PKCS8_V1_TEMPLATE: pkcs8::Template = pkcs8::Template { + bytes: include_bytes!("ecPublicKey_p384_pkcs8_v1_template.der"), + alg_id_range: core::ops::Range { start: 8, end: 24 }, + curve_id_index: 9, + private_key_index: 0x23, +}; + +#[cfg(test)] +mod tests { + use crate::testutil as test; + use crate::{rand, signature}; + + #[test] + fn signature_ecdsa_sign_fixed_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_vector_file!("ecdsa_sign_fixed_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + let msg = test_case.consume_bytes("Msg"); + let d = test_case.consume_bytes("d"); + let q = test_case.consume_bytes("Q"); + let k = test_case.consume_bytes("k"); + + let expected_result = test_case.consume_bytes("Sig"); + + let alg = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_FIXED_SIGNING, + ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_FIXED_SIGNING, + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let private_key = + signature::EcdsaKeyPair::from_private_key_and_public_key(alg, &d, &q, &rng) + .unwrap(); + let rng = test::rand::FixedSliceRandom { bytes: &k }; + + let actual_result = private_key + .sign_with_fixed_nonce_during_test(&rng, &msg) + .unwrap(); + + assert_eq!(actual_result.as_ref(), &expected_result[..]); + + Ok(()) + }, + ); + } + + #[test] + fn signature_ecdsa_sign_asn1_test() { + let rng = rand::SystemRandom::new(); + + test::run( + test_vector_file!("ecdsa_sign_asn1_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + let digest_name = test_case.consume_string("Digest"); + let msg = test_case.consume_bytes("Msg"); + let d = test_case.consume_bytes("d"); + let q = test_case.consume_bytes("Q"); + let k = test_case.consume_bytes("k"); + + let expected_result = test_case.consume_bytes("Sig"); + + let alg = match (curve_name.as_str(), digest_name.as_str()) { + ("P-256", "SHA256") => &signature::ECDSA_P256_SHA256_ASN1_SIGNING, + ("P-384", "SHA384") => &signature::ECDSA_P384_SHA384_ASN1_SIGNING, + _ => { + panic!("Unsupported curve+digest: {}+{}", curve_name, digest_name); + } + }; + + let private_key = + signature::EcdsaKeyPair::from_private_key_and_public_key(alg, &d, &q, &rng) + .unwrap(); + let rng = test::rand::FixedSliceRandom { bytes: &k }; + + let actual_result = private_key + .sign_with_fixed_nonce_during_test(&rng, &msg) + .unwrap(); + + assert_eq!(actual_result.as_ref(), &expected_result[..]); + + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ecdsa/verification.rs b/ring-0.17.14/src/ec/suite_b/ecdsa/verification.rs new file mode 100644 index 0000000000..e500831aa6 --- 
/dev/null +++ b/ring-0.17.14/src/ec/suite_b/ecdsa/verification.rs @@ -0,0 +1,332 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! ECDSA Signatures using the P-256 and P-384 curves. + +use super::digest_scalar::digest_scalar; +use crate::{ + arithmetic::montgomery::*, + cpu, digest, + ec::suite_b::{ops::*, public_key::*, verify_jacobian_point_is_on_the_curve}, + error, + io::der, + limb, sealed, signature, +}; + +/// An ECDSA verification algorithm. +pub struct EcdsaVerificationAlgorithm { + ops: &'static PublicScalarOps, + digest_alg: &'static digest::Algorithm, + split_rs: + for<'a> fn( + ops: &'static ScalarOps, + input: &mut untrusted::Reader<'a>, + ) + -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified>, + id: AlgorithmID, +} + +#[derive(Debug)] +enum AlgorithmID { + ECDSA_P256_SHA256_ASN1, + ECDSA_P256_SHA256_FIXED, + ECDSA_P256_SHA384_ASN1, + ECDSA_P384_SHA256_ASN1, + ECDSA_P384_SHA384_ASN1, + ECDSA_P384_SHA384_FIXED, +} + +derive_debug_via_id!(EcdsaVerificationAlgorithm); + +impl signature::VerificationAlgorithm for EcdsaVerificationAlgorithm { + fn verify( + &self, + public_key: untrusted::Input, + msg: untrusted::Input, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified> { + let cpu = cpu::features(); + let e = { + // NSA Guide Step 2: "Use the selected hash function to compute H = + // Hash(M)." + let h = digest::digest(self.digest_alg, msg.as_slice_less_safe()); + + // NSA Guide Step 3: "Convert the bit string H to an integer e as + // described in Appendix B.2." + let n = &self.ops.scalar_ops.scalar_modulus(cpu); + digest_scalar(n, h) + }; + + self.verify_digest(public_key, e, signature) + } +} + +impl EcdsaVerificationAlgorithm { + /// This is intentionally not public. + fn verify_digest( + &self, + public_key: untrusted::Input, + e: Scalar, + signature: untrusted::Input, + ) -> Result<(), error::Unspecified> { + let cpu = cpu::features(); + + // NSA Suite B Implementer's Guide to ECDSA Section 3.4.2. + + let public_key_ops = self.ops.public_key_ops; + let scalar_ops = self.ops.scalar_ops; + let q = &public_key_ops.common.elem_modulus(cpu); + let n = &scalar_ops.scalar_modulus(cpu); + + // NSA Guide Prerequisites: + // + // Prior to accepting a verified digital signature as valid the + // verifier shall have: + // + // 1. assurance of the signatory’s claimed identity, + // 2. an authentic copy of the domain parameters, (q, FR, a, b, SEED, + // G, n, h), + // 3. assurance of the validity of the public key, and + // 4. assurance that the claimed signatory actually possessed the + // private key that was used to generate the digital signature at + // the time that the signature was generated. + // + // Prerequisites #1 and #4 are outside the scope of what this function + // can do. 
Prerequisite #2 is handled implicitly as the domain + // parameters are hard-coded into the source. Prerequisite #3 is + // handled by `parse_uncompressed_point`. + let peer_pub_key = parse_uncompressed_point(public_key_ops, q, public_key)?; + + let (r, s) = signature.read_all(error::Unspecified, |input| { + (self.split_rs)(scalar_ops, input) + })?; + + // NSA Guide Step 1: "If r and s are not both integers in the interval + // [1, n − 1], output INVALID." + let r = scalar_parse_big_endian_variable(n, limb::AllowZero::No, r)?; + let s = scalar_parse_big_endian_variable(n, limb::AllowZero::No, s)?; + + // NSA Guide Step 4: "Compute w = s**−1 mod n, using the routine in + // Appendix B.1." + let w = self.ops.scalar_inv_to_mont_vartime(&s, cpu); + + // NSA Guide Step 5: "Compute u1 = (e * w) mod n, and compute + // u2 = (r * w) mod n." + let u1 = scalar_ops.scalar_product(&e, &w, cpu); + let u2 = scalar_ops.scalar_product(&r, &w, cpu); + + // NSA Guide Step 6: "Compute the elliptic curve point + // R = (xR, yR) = u1*G + u2*Q, using EC scalar multiplication and EC + // addition. If R is equal to the point at infinity, output INVALID." + let product = (self.ops.twin_mul)(&u1, &u2, &peer_pub_key, cpu); + + // Verify that the point we computed is on the curve; see + // `verify_affine_point_is_on_the_curve_scaled` for details on why. It + // would be more secure to do the check on the affine coordinates if we + // were going to convert to affine form (again, see + // `verify_affine_point_is_on_the_curve_scaled` for details on why). + // But, we're going to avoid converting to affine for performance + // reasons, so we do the verification using the Jacobian coordinates. + let z2 = verify_jacobian_point_is_on_the_curve(q, &product)?; + + // NSA Guide Step 7: "Compute v = xR mod n." + // NSA Guide Step 8: "Compare v and r0. If v = r0, output VALID; + // otherwise, output INVALID." + // + // Instead, we use Greg Maxwell's trick to avoid the inversion mod `q` + // that would be necessary to compute the affine X coordinate. + let x = q.point_x(&product); + fn sig_r_equals_x(q: &Modulus, r: &Elem, x: &Elem, z2: &Elem) -> bool { + let r_jacobian = q.elem_product(z2, r); + let x = q.elem_unencoded(x); + q.elems_are_equal(&r_jacobian, &x).leak() + } + let mut r = self.ops.scalar_as_elem(&r); + if sig_r_equals_x(q, &r, &x, &z2) { + return Ok(()); + } + if q.elem_less_than_vartime(&r, &self.ops.q_minus_n) { + let n = Elem::from(self.ops.n()); + q.add_assign(&mut r, &n); + if sig_r_equals_x(q, &r, &x, &z2) { + return Ok(()); + } + } + + Err(error::Unspecified) + } +} + +impl sealed::Sealed for EcdsaVerificationAlgorithm {} + +fn split_rs_fixed<'a>( + ops: &'static ScalarOps, + input: &mut untrusted::Reader<'a>, +) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified> { + let scalar_len = ops.scalar_bytes_len(); + let r = input.read_bytes(scalar_len)?; + let s = input.read_bytes(scalar_len)?; + Ok((r, s)) +} + +fn split_rs_asn1<'a>( + _ops: &'static ScalarOps, + input: &mut untrusted::Reader<'a>, +) -> Result<(untrusted::Input<'a>, untrusted::Input<'a>), error::Unspecified> { + der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { + let r = der::positive_integer(input)?.big_endian_without_leading_zero_as_input(); + let s = der::positive_integer(input)?.big_endian_without_leading_zero_as_input(); + Ok((r, s)) + }) +} + +/// Verification of fixed-length (PKCS#11 style) ECDSA signatures using the +/// P-256 curve and SHA-256. 
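+///
+/// A fixed-form P-256 signature is exactly 64 bytes: `r || s`, each encoded
+/// as a 32-byte zero-padded big-endian integer. (The ASN.1 form, by contrast,
+/// is a DER `SEQUENCE` of two `INTEGER`s and is variable-length, at most
+/// 72 bytes for P-256.)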
+/// +/// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA256_FIXED: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p256::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA256, + split_rs: split_rs_fixed, + id: AlgorithmID::ECDSA_P256_SHA256_FIXED, +}; + +/// Verification of fixed-length (PKCS#11 style) ECDSA signatures using the +/// P-384 curve and SHA-384. +/// +/// See "`ECDSA_*_FIXED` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P384_SHA384_FIXED: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p384::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA384, + split_rs: split_rs_fixed, + id: AlgorithmID::ECDSA_P384_SHA384_FIXED, +}; + +/// Verification of ASN.1 DER-encoded ECDSA signatures using the P-256 curve +/// and SHA-256. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA256_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p256::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA256, + split_rs: split_rs_asn1, + id: AlgorithmID::ECDSA_P256_SHA256_ASN1, +}; + +/// *Not recommended*. Verification of ASN.1 DER-encoded ECDSA signatures using +/// the P-256 curve and SHA-384. +/// +/// In most situations, P-256 should be used only with SHA-256 and P-384 +/// should be used only with SHA-384. However, in some cases, particularly TLS +/// on the web, it is necessary to support P-256 with SHA-384 for compatibility +/// with widely-deployed implementations that do not follow these guidelines. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P256_SHA384_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p256::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA384, + split_rs: split_rs_asn1, + id: AlgorithmID::ECDSA_P256_SHA384_ASN1, +}; + +/// *Not recommended*. Verification of ASN.1 DER-encoded ECDSA signatures using +/// the P-384 curve and SHA-256. +/// +/// In most situations, P-256 should be used only with SHA-256 and P-384 +/// should be used only with SHA-384. However, in some cases, particularly TLS +/// on the web, it is necessary to support P-256 with SHA-384 for compatibility +/// with widely-deployed implementations that do not follow these guidelines. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. +pub static ECDSA_P384_SHA256_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p384::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA256, + split_rs: split_rs_asn1, + id: AlgorithmID::ECDSA_P384_SHA256_ASN1, +}; + +/// Verification of ASN.1 DER-encoded ECDSA signatures using the P-384 curve +/// and SHA-384. +/// +/// See "`ECDSA_*_ASN1` Details" in `ring::signature`'s module-level +/// documentation for more details. 
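+//
+// Illustrative sketch (placeholder names) of driving any of these algorithms
+// through the `signature::VerificationAlgorithm` trait implemented above,
+// where `public_key_bytes` is the SEC 1 uncompressed point and
+// `signature_bytes` is in the format the algorithm expects:
+//
+//     alg.verify(
+//         untrusted::Input::from(public_key_bytes),
+//         untrusted::Input::from(message),
+//         untrusted::Input::from(signature_bytes),
+//     )?;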
+pub static ECDSA_P384_SHA384_ASN1: EcdsaVerificationAlgorithm = EcdsaVerificationAlgorithm { + ops: &p384::PUBLIC_SCALAR_OPS, + digest_alg: &digest::SHA384, + split_rs: split_rs_asn1, + id: AlgorithmID::ECDSA_P384_SHA384_ASN1, +}; + +#[cfg(test)] +mod tests { + extern crate alloc; + use super::*; + use crate::testutil as test; + use alloc::{vec, vec::Vec}; + + #[test] + fn test_digest_based_test_vectors() { + let cpu = cpu::features(); + test::run( + test_vector_file!("../../../../crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + + let public_key = { + let mut public_key = vec![0x04]; + public_key.extend(&test_case.consume_bytes("X")); + public_key.extend(&test_case.consume_bytes("Y")); + public_key + }; + + let digest = test_case.consume_bytes("Digest"); + + let sig = { + let mut sig = Vec::new(); + sig.extend(&test_case.consume_bytes("R")); + sig.extend(&test_case.consume_bytes("S")); + sig + }; + + let invalid = test_case.consume_optional_string("Invalid"); + + let alg = match curve_name.as_str() { + "P-256" => &ECDSA_P256_SHA256_FIXED, + "P-384" => &ECDSA_P384_SHA384_FIXED, + _ => { + panic!("Unsupported curve: {}", curve_name); + } + }; + let n = &alg.ops.scalar_ops.scalar_modulus(cpu); + + let digest = super::super::digest_scalar::digest_bytes_scalar(n, &digest[..]); + let actual_result = alg.verify_digest( + untrusted::Input::from(&public_key[..]), + digest, + untrusted::Input::from(&sig[..]), + ); + assert_eq!(actual_result.is_ok(), invalid.is_none()); + + Ok(()) + }, + ); + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ops.rs b/ring-0.17.14/src/ec/suite_b/ops.rs new file mode 100644 index 0000000000..7c2ed7f208 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ops.rs @@ -0,0 +1,1406 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::{ + arithmetic::limbs_from_hex, + arithmetic::montgomery::*, + bb::LeakyWord, + cpu, + error::{self, LenMismatchError}, + limb::*, +}; +use core::marker::PhantomData; + +use elem::{mul_mont, unary_op, unary_op_assign, unary_op_from_binary_op_assign}; + +/// A field element, i.e. an element of ℤ/qℤ for the curve's field modulus +/// *q*. +pub type Elem = elem::Elem; +type PublicElem = elem::PublicElem; + +/// Represents the (prime) order *q* of the curve's prime field. +#[derive(Clone, Copy)] +pub enum Q {} + +/// A scalar. Its value is in [0, n). Zero-valued scalars are forbidden in most +/// contexts. +pub type Scalar = elem::Elem; +type PublicScalar = elem::PublicElem; + +/// Represents the prime order *n* of the curve's group. 
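+//
+// For both supported curves the field prime q and the group order n satisfy
+// n < q < 2*n; the reductions below (e.g. `elem_reduced_to_scalar`) rely on
+// this, since it means a fully reduced field element can be brought into
+// [0, n) with at most one conditional subtraction of n.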
+#[derive(Clone, Copy)] +pub enum N {} + +pub(super) struct Modulus { + // TODO: [Limb; elem::NumLimbs::MAX] + limbs: &'static [Limb; elem::NumLimbs::MAX], + num_limbs: elem::NumLimbs, + cops: &'static CommonOps, + m: PhantomData, + cpu: cpu::Features, +} + +pub struct Point { + // The coordinates are stored in a contiguous array, where the first + // `ops.num_limbs` elements are the X coordinate, the next + // `ops.num_limbs` elements are the Y coordinate, and the next + // `ops.num_limbs` elements are the Z coordinate. This layout is dictated + // by the requirements of the nistz256 code. + xyz: [Limb; 3 * elem::NumLimbs::MAX], +} + +impl Point { + pub fn new_at_infinity() -> Self { + Self { + xyz: [0; 3 * elem::NumLimbs::MAX], + } + } +} + +/// Operations and values needed by all curve operations. +pub struct CommonOps { + num_limbs: elem::NumLimbs, + q: PublicModulus, + n: PublicElem, + + pub a: PublicElem, // Must be -3 mod q + pub b: PublicElem, + + // In all cases, `r`, `a`, and `b` may all alias each other. + elem_mul_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + elem_sqr_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), +} + +impl CommonOps { + pub(super) fn elem_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { + Modulus { + // TODO: limbs: self.q.p.map(Limb::from), + limbs: &self.q.p, + num_limbs: self.num_limbs, + cops: self, + m: PhantomData, + cpu: cpu_features, + } + } + + pub(super) fn scalar_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { + Modulus { + // TODO: limbs: self.n.limbs.map(Limb::from), + limbs: &self.n.limbs, + num_limbs: self.num_limbs, + cops: self, + m: PhantomData, + cpu: cpu_features, + } + } + + // The length of a field element, which is the same as the length of a + // scalar, in bytes. + pub fn len(&self) -> usize { + // Keep in sync with `Modulus::len()` + self.num_limbs.into() * LIMB_BYTES + } + + #[cfg(test)] + pub(super) fn n_limbs(&self) -> &[Limb] { + &self.n.limbs[..self.num_limbs.into()] + } +} + +impl Modulus { + pub fn cpu(&self) -> cpu::Features { + self.cpu + } + + // Keep in sync with `CommonOps::len()`. 
+ pub fn bytes_len(&self) -> usize { + self.num_limbs.into() * LIMB_BYTES + } +} + +impl Modulus { + #[inline] + pub fn add_assign(&self, a: &mut elem::Elem, b: &elem::Elem) { + let num_limbs = self.num_limbs.into(); + limbs_add_assign_mod( + &mut a.limbs[..num_limbs], + &b.limbs[..num_limbs], + &self.limbs[..num_limbs], + ) + .unwrap_or_else(unwrap_impossible_len_mismatch_error) + } +} + +impl Modulus { + #[inline] + pub fn elems_are_equal(&self, a: &Elem, b: &Elem) -> LimbMask { + let num_limbs = self.num_limbs.into(); + limbs_equal_limbs_consttime(&a.limbs[..num_limbs], &b.limbs[..num_limbs]) + .unwrap_or_else(unwrap_impossible_len_mismatch_error) + } + + #[inline] + pub fn elem_unencoded(&self, a: &Elem) -> Elem { + self.elem_product(a, &Elem::one()) + } +} + +impl CommonOps { + #[inline] + fn is_zero(&self, a: &elem::Elem) -> bool { + let num_limbs = self.num_limbs.into(); + limbs_are_zero(&a.limbs[..num_limbs]).leak() + } + + #[inline] + fn elem_mul(&self, a: &mut Elem, b: &Elem, _cpu: cpu::Features) { + elem::binary_op_assign(self.elem_mul_mont, a, b) + } + + #[inline] + fn elem_product( + &self, + a: &Elem, + b: &Elem, + _cpu: cpu::Features, + ) -> Elem<<(EA, EB) as ProductEncoding>::Output> + where + (EA, EB): ProductEncoding, + { + mul_mont(self.elem_mul_mont, a, b) + } + + #[inline] + fn elem_square(&self, a: &mut Elem, _cpu: cpu::Features) { + unary_op_assign(self.elem_sqr_mont, a); + } + + #[inline] + fn elem_squared(&self, a: &Elem, _cpu: cpu::Features) -> Elem { + unary_op(self.elem_sqr_mont, a) + } +} + +impl Modulus { + #[inline] + pub fn elem_mul(&self, a: &mut Elem, b: &Elem) { + self.cops.elem_mul(a, b, self.cpu) + } + + #[inline] + pub fn elem_product( + &self, + a: &Elem, + b: &Elem, + ) -> Elem<<(EA, EB) as ProductEncoding>::Output> + where + (EA, EB): ProductEncoding, + { + self.cops.elem_product(a, b, self.cpu) + } + + #[inline] + pub fn elem_square(&self, a: &mut Elem) { + self.cops.elem_square(a, self.cpu) + } + + #[inline] + pub fn elem_squared(&self, a: &Elem) -> Elem { + self.cops.elem_squared(a, self.cpu) + } +} + +impl Modulus { + #[inline] + pub fn is_zero(&self, a: &elem::Elem) -> bool { + self.cops.is_zero(a) + } +} + +impl Modulus { + pub fn elem_verify_is_not_zero(&self, a: &Elem) -> Result<(), error::Unspecified> { + if self.is_zero(a) { + Err(error::Unspecified) + } else { + Ok(()) + } + } + + pub(super) fn a(&self) -> &'static PublicElem { + &self.cops.a + } + pub(super) fn b(&self) -> &'static PublicElem { + &self.cops.b + } +} + +impl PrivateKeyOps { + pub(super) fn point_sum(&self, a: &Point, b: &Point, _cpu: cpu::Features) -> Point { + let mut r = Point::new_at_infinity(); + unsafe { + (self.point_add_jacobian_impl)(r.xyz.as_mut_ptr(), a.xyz.as_ptr(), b.xyz.as_ptr()) + } + r + } +} + +impl Modulus { + pub fn point_x(&self, p: &Point) -> Elem { + let num_limbs = self.num_limbs.into(); + let mut r = Elem::zero(); + r.limbs[..num_limbs].copy_from_slice(&p.xyz[0..num_limbs]); + r + } + + pub fn point_y(&self, p: &Point) -> Elem { + let num_limbs = self.num_limbs.into(); + let mut r = Elem::zero(); + r.limbs[..num_limbs].copy_from_slice(&p.xyz[num_limbs..(2 * num_limbs)]); + r + } + + pub fn point_z(&self, p: &Point) -> Elem { + let num_limbs = self.num_limbs.into(); + let mut r = Elem::zero(); + r.limbs[..num_limbs].copy_from_slice(&p.xyz[(2 * num_limbs)..(3 * num_limbs)]); + r + } +} + +struct PublicModulus { + p: [LeakyLimb; elem::NumLimbs::MAX], + rr: PublicElem, +} + +/// Operations on private keys, for ECDH and ECDSA signing. 
+pub struct PrivateKeyOps { + pub common: &'static CommonOps, + elem_inv_squared: fn(q: &Modulus, a: &Elem) -> Elem, + point_mul_base_impl: fn(a: &Scalar, cpu: cpu::Features) -> Point, + point_mul_impl: unsafe extern "C" fn( + r: *mut Limb, // [3][num_limbs] + p_scalar: *const Limb, // [num_limbs] + p_x: *const Limb, // [num_limbs] + p_y: *const Limb, // [num_limbs] + ), + point_add_jacobian_impl: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), +} + +impl PrivateKeyOps { + pub fn leak_limbs<'a>(&self, a: &'a Elem) -> &'a [Limb] { + &a.limbs[..self.common.num_limbs.into()] + } + + #[inline(always)] + pub(super) fn point_mul_base(&self, a: &Scalar, cpu: cpu::Features) -> Point { + (self.point_mul_base_impl)(a, cpu) + } + + #[inline(always)] + pub(super) fn point_mul( + &self, + p_scalar: &Scalar, + (p_x, p_y): &(Elem, Elem), + _cpu: cpu::Features, + ) -> Point { + let mut r = Point::new_at_infinity(); + unsafe { + (self.point_mul_impl)( + r.xyz.as_mut_ptr(), + p_scalar.limbs.as_ptr(), + p_x.limbs.as_ptr(), + p_y.limbs.as_ptr(), + ); + } + r + } + + #[inline] + pub(super) fn elem_inverse_squared(&self, q: &Modulus, a: &Elem) -> Elem { + (self.elem_inv_squared)(q, a) + } +} + +/// Operations and values needed by all operations on public keys (ECDH +/// agreement and ECDSA verification). +pub struct PublicKeyOps { + pub common: &'static CommonOps, +} + +impl PublicKeyOps { + // The serialized bytes are in big-endian order, zero-padded. The limbs + // of `Elem` are in the native endianness, least significant limb to + // most significant limb. Besides the parsing, conversion, this also + // implements NIST SP 800-56A Step 2: "Verify that xQ and yQ are integers + // in the interval [0, p-1] in the case that q is an odd prime p[.]" + pub(super) fn elem_parse( + &self, + q: &Modulus, + input: &mut untrusted::Reader, + ) -> Result, error::Unspecified> { + let _cpu = cpu::features(); + let encoded_value = input.read_bytes(self.common.len())?; + let parsed = elem_parse_big_endian_fixed_consttime(q, encoded_value)?; + let mut r = Elem::zero(); + let rr = Elem::from(&self.common.q.rr); + // Montgomery encode (elem_to_mont). + // TODO: do something about this. + unsafe { + (self.common.elem_mul_mont)( + r.limbs.as_mut_ptr(), + parsed.limbs.as_ptr(), + rr.limbs.as_ptr(), + ) + } + Ok(r) + } +} + +// Operations used by both ECDSA signing and ECDSA verification. In general +// these must be side-channel resistant. +pub struct ScalarOps { + pub common: &'static CommonOps, + + scalar_mul_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), +} + +impl ScalarOps { + pub(super) fn scalar_modulus(&'static self, cpu_features: cpu::Features) -> Modulus { + self.common.scalar_modulus(cpu_features) + } + + // The (maximum) length of a scalar, not including any padding. + pub fn scalar_bytes_len(&self) -> usize { + self.common.len() + } +} + +impl ScalarOps { + pub fn leak_limbs<'s>(&self, s: &'s Scalar) -> &'s [Limb] { + &s.limbs[..self.common.num_limbs.into()] + } + + #[inline] + pub(super) fn scalar_product( + &self, + a: &Scalar, + b: &Scalar, + _cpu: cpu::Features, + ) -> Scalar<<(EA, EB) as ProductEncoding>::Output> + where + (EA, EB): ProductEncoding, + { + mul_mont(self.scalar_mul_mont, a, b) + } +} + +/// Operations on public scalars needed by ECDSA signature verification. 
+pub struct PublicScalarOps { + pub scalar_ops: &'static ScalarOps, + pub public_key_ops: &'static PublicKeyOps, + + pub(super) twin_mul: fn( + g_scalar: &Scalar, + p_scalar: &Scalar, + p_xy: &(Elem, Elem), + cpu: cpu::Features, + ) -> Point, + scalar_inv_to_mont_vartime: fn(s: &Scalar, cpu: cpu::Features) -> Scalar, + pub(super) q_minus_n: PublicElem, +} + +impl PublicScalarOps { + pub fn n(&self) -> &PublicElem { + &self.scalar_ops.common.n + } + + #[inline] + pub fn scalar_as_elem(&self, a: &Scalar) -> Elem { + Elem { + limbs: a.limbs, + m: PhantomData, + encoding: PhantomData, + } + } +} + +impl Modulus { + pub fn elem_less_than_vartime(&self, a: &Elem, b: &PublicElem) -> bool { + let num_limbs = self.num_limbs.into(); + limbs_less_than_limbs_vartime(&a.limbs[..num_limbs], &b.limbs[..num_limbs]) + .unwrap_or_else(|LenMismatchError { .. }| unreachable!()) + } +} + +impl PublicScalarOps { + pub(super) fn scalar_inv_to_mont_vartime( + &self, + s: &Scalar, + cpu: cpu::Features, + ) -> Scalar { + (self.scalar_inv_to_mont_vartime)(s, cpu) + } +} + +#[allow(non_snake_case)] +pub struct PrivateScalarOps { + pub scalar_ops: &'static ScalarOps, + + oneRR_mod_n: PublicScalar, // 1 * R**2 (mod n). TOOD: Use One. + scalar_inv_to_mont: fn(a: Scalar, cpu: cpu::Features) -> Scalar, +} + +impl PrivateScalarOps { + pub(super) fn to_mont(&self, s: &Scalar, cpu: cpu::Features) -> Scalar { + self.scalar_ops + .scalar_product(s, &Scalar::from(&self.oneRR_mod_n), cpu) + } + + /// Returns the modular inverse of `a` (mod `n`). Panics if `a` is zero. + pub(super) fn scalar_inv_to_mont(&self, a: &Scalar, cpu: cpu::Features) -> Scalar { + assert!(!self.scalar_ops.common.is_zero(a)); + let a = self.to_mont(a, cpu); + (self.scalar_inv_to_mont)(a, cpu) + } +} + +// XXX: Inefficient and unnecessarily depends on `PrivateKeyOps`. TODO: implement interleaved wNAF +// multiplication. +fn twin_mul_inefficient( + ops: &PrivateKeyOps, + g_scalar: &Scalar, + p_scalar: &Scalar, + p_xy: &(Elem, Elem), + cpu: cpu::Features, +) -> Point { + let scaled_g = ops.point_mul_base(g_scalar, cpu); + let scaled_p = ops.point_mul(p_scalar, p_xy, cpu); + ops.point_sum(&scaled_g, &scaled_p, cpu) +} + +// This assumes n < q < 2*n. +impl Modulus { + pub fn elem_reduced_to_scalar(&self, elem: &Elem) -> Scalar { + let num_limbs = self.num_limbs.into(); + let mut r_limbs = elem.limbs; + limbs_reduce_once(&mut r_limbs[..num_limbs], &self.limbs[..num_limbs]) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + Scalar { + limbs: r_limbs, + m: PhantomData, + encoding: PhantomData, + } + } +} + +// Returns (`a` squared `squarings` times) * `b`. +fn elem_sqr_mul( + ops: &CommonOps, + a: &Elem, + squarings: LeakyWord, + b: &Elem, + cpu: cpu::Features, +) -> Elem { + debug_assert!(squarings >= 1); + let mut tmp = ops.elem_squared(a, cpu); + for _ in 1..squarings { + ops.elem_square(&mut tmp, cpu); + } + ops.elem_product(&tmp, b, cpu) +} + +// Sets `acc` = (`acc` squared `squarings` times) * `b`. 
+fn elem_sqr_mul_acc( + ops: &CommonOps, + acc: &mut Elem, + squarings: LeakyWord, + b: &Elem, + cpu: cpu::Features, +) { + debug_assert!(squarings >= 1); + for _ in 0..squarings { + ops.elem_square(acc, cpu); + } + ops.elem_mul(acc, b, cpu) +} + +#[inline] +pub(super) fn elem_parse_big_endian_fixed_consttime( + q: &Modulus, + bytes: untrusted::Input, +) -> Result, error::Unspecified> { + parse_big_endian_fixed_consttime(q, bytes, AllowZero::Yes) +} + +#[inline] +pub(super) fn scalar_parse_big_endian_fixed_consttime( + n: &Modulus, + bytes: untrusted::Input, +) -> Result { + parse_big_endian_fixed_consttime(n, bytes, AllowZero::No) +} + +#[inline] +pub(super) fn scalar_parse_big_endian_variable( + n: &Modulus, + allow_zero: AllowZero, + bytes: untrusted::Input, +) -> Result { + let num_limbs = n.num_limbs.into(); + let mut r = Scalar::zero(); + parse_big_endian_in_range_and_pad_consttime( + bytes, + allow_zero, + &n.limbs[..num_limbs], + &mut r.limbs[..num_limbs], + )?; + Ok(r) +} + +pub(super) fn scalar_parse_big_endian_partially_reduced_variable_consttime( + n: &Modulus, + bytes: untrusted::Input, +) -> Result { + let num_limbs = n.num_limbs.into(); + let mut r = Scalar::zero(); + { + let r = &mut r.limbs[..num_limbs]; + parse_big_endian_and_pad_consttime(bytes, r)?; + limbs_reduce_once(r, &n.limbs[..num_limbs]) + .unwrap_or_else(unwrap_impossible_len_mismatch_error); + } + + Ok(r) +} + +fn parse_big_endian_fixed_consttime( + m: &Modulus, + bytes: untrusted::Input, + allow_zero: AllowZero, +) -> Result, error::Unspecified> { + let num_limbs = m.num_limbs.into(); + if bytes.len() != m.bytes_len() { + return Err(error::Unspecified); + } + let mut r = elem::Elem::zero(); + parse_big_endian_in_range_and_pad_consttime( + bytes, + allow_zero, + &m.limbs[..num_limbs], + &mut r.limbs[..num_limbs], + )?; + Ok(r) +} + +#[cold] +#[inline(never)] +fn unwrap_impossible_len_mismatch_error(LenMismatchError { .. 
}: LenMismatchError) -> T { + unreachable!() +} + +#[cfg(test)] +mod tests { + extern crate alloc; + use super::*; + use crate::testutil as test; + use alloc::{format, vec, vec::Vec}; + + const ZERO_SCALAR: Scalar = Scalar { + limbs: [0; elem::NumLimbs::MAX], + m: PhantomData, + encoding: PhantomData, + }; + + trait Convert { + fn convert(self, q: &Modulus) -> Elem; + } + + impl Convert for Elem { + fn convert(self, _q: &Modulus) -> Elem { + self + } + } + + impl Convert for Elem { + fn convert(self, q: &Modulus) -> Elem { + q.elem_unencoded(&self) + } + } + + fn q_minus_n_plus_n_equals_0_test(ops: &PublicScalarOps) { + let cops = ops.scalar_ops.common; + let q = &cops.elem_modulus(cpu::features()); + let mut x = Elem::from(&ops.q_minus_n); + q.add_assign(&mut x, &Elem::from(&cops.n)); + assert!(q.is_zero(&x)); + } + + #[test] + fn p256_q_minus_n_plus_n_equals_0_test() { + q_minus_n_plus_n_equals_0_test(&p256::PUBLIC_SCALAR_OPS); + } + + #[test] + fn p384_q_minus_n_plus_n_equals_0_test() { + q_minus_n_plus_n_equals_0_test(&p384::PUBLIC_SCALAR_OPS); + } + + #[test] + fn p256_elem_add_test() { + elem_add_test( + &p256::PUBLIC_SCALAR_OPS, + test_vector_file!("ops/p256_elem_sum_tests.txt"), + ); + } + + #[test] + fn p384_elem_add_test() { + elem_add_test( + &p384::PUBLIC_SCALAR_OPS, + test_vector_file!("ops/p384_elem_sum_tests.txt"), + ); + } + + fn elem_add_test(ops: &PublicScalarOps, test_file: test::File) { + let cops = ops.public_key_ops.common; + let q = &cops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_elem(q, test_case, "a"); + let b = consume_elem(q, test_case, "b"); + let expected_sum = consume_elem(q, test_case, "r"); + + let mut actual_sum = a; + q.add_assign(&mut actual_sum, &b); + assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); + + let mut actual_sum = b; + q.add_assign(&mut actual_sum, &a); + assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); + + Ok(()) + }) + } + + // XXX: There's no `p256_sub` in *ring*; it's logic is inlined into + // the point arithmetic functions. Thus, we can't test it. + + #[test] + fn p384_elem_sub_test() { + prefixed_extern! { + fn p384_elem_sub(r: *mut Limb, a: *const Limb, b: *const Limb); + } + elem_sub_test( + &p384::COMMON_OPS, + p384_elem_sub, + test_vector_file!("ops/p384_elem_sum_tests.txt"), + ); + } + + fn elem_sub_test( + ops: &'static CommonOps, + elem_sub: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + test_file: test::File, + ) { + let q = &ops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_elem(q, test_case, "a"); + let b = consume_elem(q, test_case, "b"); + let r = consume_elem(q, test_case, "r"); + + let mut actual_difference = Elem::::zero(); + unsafe { + elem_sub( + actual_difference.limbs.as_mut_ptr(), + r.limbs.as_ptr(), + b.limbs.as_ptr(), + ); + } + assert_limbs_are_equal(ops, &actual_difference.limbs, &a.limbs); + + let mut actual_difference = Elem::::zero(); + unsafe { + elem_sub( + actual_difference.limbs.as_mut_ptr(), + r.limbs.as_ptr(), + a.limbs.as_ptr(), + ); + } + assert_limbs_are_equal(ops, &actual_difference.limbs, &b.limbs); + + Ok(()) + }) + } + + // XXX: There's no `p256_div_by_2` in *ring*; it's logic is inlined + // into the point arithmetic functions. Thus, we can't test it. + + #[test] + fn p384_elem_div_by_2_test() { + prefixed_extern! 
{ + fn p384_elem_div_by_2(r: *mut Limb, a: *const Limb); + } + elem_div_by_2_test( + &p384::COMMON_OPS, + p384_elem_div_by_2, + test_vector_file!("ops/p384_elem_div_by_2_tests.txt"), + ); + } + + fn elem_div_by_2_test( + ops: &'static CommonOps, + elem_div_by_2: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), + test_file: test::File, + ) { + let q = &ops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_elem(q, test_case, "a"); + let r = consume_elem(q, test_case, "r"); + + let mut actual_result = Elem::::zero(); + unsafe { + elem_div_by_2(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr()); + } + assert_limbs_are_equal(ops, &actual_result.limbs, &r.limbs); + + Ok(()) + }) + } + + // There is no `ecp_nistz256_neg` on other targets. + #[cfg(target_arch = "x86_64")] + #[test] + fn p256_elem_neg_test() { + prefixed_extern! { + fn ecp_nistz256_neg(r: *mut Limb, a: *const Limb); + } + elem_neg_test( + &p256::COMMON_OPS, + ecp_nistz256_neg, + test_vector_file!("ops/p256_elem_neg_tests.txt"), + ); + } + + #[test] + fn p384_elem_neg_test() { + prefixed_extern! { + fn p384_elem_neg(r: *mut Limb, a: *const Limb); + } + elem_neg_test( + &p384::COMMON_OPS, + p384_elem_neg, + test_vector_file!("ops/p384_elem_neg_tests.txt"), + ); + } + + fn elem_neg_test( + ops: &'static CommonOps, + elem_neg: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), + test_file: test::File, + ) { + let q = &ops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_elem(q, test_case, "a"); + let b = consume_elem(q, test_case, "b"); + + // Verify -a == b. + { + let mut actual_result = Elem::::zero(); + unsafe { + elem_neg(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr()); + } + assert_limbs_are_equal(ops, &actual_result.limbs, &b.limbs); + } + + // Verify -b == a. 
+ { + let mut actual_result = Elem::::zero(); + unsafe { + elem_neg(actual_result.limbs.as_mut_ptr(), b.limbs.as_ptr()); + } + assert_limbs_are_equal(ops, &actual_result.limbs, &a.limbs); + } + + Ok(()) + }) + } + + #[test] + fn p256_elem_mul_test() { + elem_mul_test( + &p256::COMMON_OPS, + test_vector_file!("ops/p256_elem_mul_tests.txt"), + ); + } + + #[test] + fn p384_elem_mul_test() { + elem_mul_test( + &p384::COMMON_OPS, + test_vector_file!("ops/p384_elem_mul_tests.txt"), + ); + } + + fn elem_mul_test(ops: &'static CommonOps, test_file: test::File) { + let q = &ops.elem_modulus(cpu::features()); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let mut a = consume_elem(q, test_case, "a"); + let b = consume_elem(q, test_case, "b"); + let r = consume_elem(q, test_case, "r"); + q.elem_mul(&mut a, &b); + assert_limbs_are_equal(ops, &a.limbs, &r.limbs); + + Ok(()) + }) + } + + #[test] + fn p256_scalar_mul_test() { + scalar_mul_test( + &p256::SCALAR_OPS, + test_vector_file!("ops/p256_scalar_mul_tests.txt"), + ); + } + + #[test] + fn p384_scalar_mul_test() { + scalar_mul_test( + &p384::SCALAR_OPS, + test_vector_file!("ops/p384_scalar_mul_tests.txt"), + ); + } + + fn scalar_mul_test(ops: &ScalarOps, test_file: test::File) { + let cpu = cpu::features(); + let cops = ops.common; + let n = &cops.scalar_modulus(cpu); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let a = consume_scalar(n, test_case, "a"); + let b = consume_scalar_mont(n, test_case, "b"); + let expected_result = consume_scalar(n, test_case, "r"); + let actual_result = ops.scalar_product(&a, &b, cpu); + assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); + + Ok(()) + }) + } + + #[test] + fn p256_scalar_square_test() { + prefixed_extern! { + fn p256_scalar_sqr_rep_mont(r: *mut Limb, a: *const Limb, rep: LeakyWord); + } + scalar_square_test( + &p256::SCALAR_OPS, + p256_scalar_sqr_rep_mont, + test_vector_file!("ops/p256_scalar_square_tests.txt"), + ); + } + + // XXX: There's no `p384_scalar_square_test()` because there's no dedicated + // `p384_scalar_sqr_rep_mont()`. 
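+
+    // Illustrative sketch, not part of the upstream *ring* sources: the
+    // `*_sqr_rep_mont` routines compute `a` squared `rep` times, i.e.
+    // a**(2**rep), which is the identity `sqr_mul`/`sqr_mul_acc` rely on
+    // before the final multiplication by `b`. A minimal check of that
+    // identity using plain `u64` arithmetic and a hypothetical small odd
+    // modulus (not a real curve order):
+    #[test]
+    fn sqr_rep_equals_power_of_two_exponent_sketch() {
+        fn mul_mod(a: u64, b: u64, n: u64) -> u64 {
+            // Operands are reduced first, so the product fits in a u64 for
+            // this toy modulus.
+            (a % n) * (b % n) % n
+        }
+        fn pow_mod(mut a: u64, mut e: u64, n: u64) -> u64 {
+            let mut r = 1;
+            while e > 0 {
+                if e & 1 == 1 {
+                    r = mul_mod(r, a, n);
+                }
+                a = mul_mod(a, a, n);
+                e >>= 1;
+            }
+            r
+        }
+        let n = 1_000_003u64; // hypothetical small odd modulus
+        let a = 123_456u64;
+        // Squaring five times computes a**(2**5) == a**32 (mod n).
+        let mut squared = a % n;
+        for _ in 0..5 {
+            squared = mul_mod(squared, squared, n);
+        }
+        assert_eq!(squared, pow_mod(a, 32, n));
+    }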
+ + fn scalar_square_test( + ops: &ScalarOps, + sqr_rep: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, rep: LeakyWord), + test_file: test::File, + ) { + let cpu = cpu::features(); + let cops = ops.common; + let n = &cops.scalar_modulus(cpu); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let cpu = cpu::features(); + let a = consume_scalar(n, test_case, "a"); + let expected_result = consume_scalar(n, test_case, "r"); + + { + let mut actual_result: Scalar = Scalar { + limbs: [0; elem::NumLimbs::MAX], + m: PhantomData, + encoding: PhantomData, + }; + unsafe { + sqr_rep(actual_result.limbs.as_mut_ptr(), a.limbs.as_ptr(), 1); + } + assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); + } + + { + let actual_result = ops.scalar_product(&a, &a, cpu); + assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); + } + + Ok(()) + }) + } + + #[test] + #[should_panic(expected = "!self.scalar_ops.common.is_zero(a)")] + fn p256_scalar_inv_to_mont_zero_panic_test() { + let _ = p256::PRIVATE_SCALAR_OPS.scalar_inv_to_mont(&ZERO_SCALAR, cpu::features()); + } + + #[test] + #[should_panic(expected = "!self.scalar_ops.common.is_zero(a)")] + fn p384_scalar_inv_to_mont_zero_panic_test() { + let _ = p384::PRIVATE_SCALAR_OPS.scalar_inv_to_mont(&ZERO_SCALAR, cpu::features()); + } + + #[test] + fn p256_point_sum_test() { + point_sum_test( + &p256::PRIVATE_KEY_OPS, + test_vector_file!("ops/p256_point_sum_tests.txt"), + ); + } + + #[test] + fn p384_point_sum_test() { + point_sum_test( + &p384::PRIVATE_KEY_OPS, + test_vector_file!("ops/p384_point_sum_tests.txt"), + ); + } + + fn point_sum_test(ops: &PrivateKeyOps, test_file: test::File) { + let cpu = cpu::features(); + + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_jacobian_point(ops, test_case, "a"); + let b = consume_jacobian_point(ops, test_case, "b"); + let r_expected: TestPoint = consume_point(ops, test_case, "r"); + + let r_actual = ops.point_sum(&a, &b, cpu); + assert_point_actual_equals_expected(ops, &r_actual, &r_expected); + + Ok(()) + }); + } + + #[test] + fn p256_point_sum_mixed_test() { + prefixed_extern! { + fn p256_point_add_affine( + r: *mut Limb, // [p256::COMMON_OPS.num_limbs*3] + a: *const Limb, // [p256::COMMON_OPS.num_limbs*3] + b: *const Limb, // [p256::COMMON_OPS.num_limbs*2] + ); + } + point_sum_mixed_test( + &p256::PRIVATE_KEY_OPS, + p256_point_add_affine, + test_vector_file!("ops/p256_point_sum_mixed_tests.txt"), + ); + } + + // XXX: There is no `nistz384_point_add_affine()`. + + fn point_sum_mixed_test( + ops: &PrivateKeyOps, + point_add_affine: unsafe extern "C" fn( + r: *mut Limb, // [ops.num_limbs*3] + a: *const Limb, // [ops.num_limbs*3] + b: *const Limb, // [ops.num_limbs*2] + ), + test_file: test::File, + ) { + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_jacobian_point(ops, test_case, "a"); + let b = consume_affine_point(ops, test_case, "b"); + let r_expected: TestPoint = consume_point(ops, test_case, "r"); + + let mut r_actual = Point::new_at_infinity(); + unsafe { + point_add_affine(r_actual.xyz.as_mut_ptr(), a.xyz.as_ptr(), b.xy.as_ptr()); + } + + assert_point_actual_equals_expected(ops, &r_actual, &r_expected); + + Ok(()) + }); + } + + #[test] + fn p256_point_double_test() { + prefixed_extern! 
{ + fn p256_point_double( + r: *mut Limb, // [p256::COMMON_OPS.num_limbs*3] + a: *const Limb, // [p256::COMMON_OPS.num_limbs*3] + ); + } + point_double_test( + &p256::PRIVATE_KEY_OPS, + p256_point_double, + test_vector_file!("ops/p256_point_double_tests.txt"), + ); + } + + #[test] + fn p384_point_double_test() { + prefixed_extern! { + fn p384_point_double( + r: *mut Limb, // [p384::COMMON_OPS.num_limbs*3] + a: *const Limb, // [p384::COMMON_OPS.num_limbs*3] + ); + } + point_double_test( + &p384::PRIVATE_KEY_OPS, + p384_point_double, + test_vector_file!("ops/p384_point_double_tests.txt"), + ); + } + + fn point_double_test( + ops: &PrivateKeyOps, + point_double: unsafe extern "C" fn( + r: *mut Limb, // [ops.num_limbs*3] + a: *const Limb, // [ops.num_limbs*3] + ), + test_file: test::File, + ) { + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + + let a = consume_jacobian_point(ops, test_case, "a"); + let r_expected: TestPoint = consume_point(ops, test_case, "r"); + + let mut r_actual = Point::new_at_infinity(); + unsafe { + point_double(r_actual.xyz.as_mut_ptr(), a.xyz.as_ptr()); + } + + assert_point_actual_equals_expected(ops, &r_actual, &r_expected); + + Ok(()) + }); + } + + /// TODO: We should be testing `point_mul` with points other than the generator. + #[test] + fn p256_point_mul_test() { + let generator = ( + Elem::from(&p256::GENERATOR.0), + Elem::from(&p256::GENERATOR.1), + ); + point_mul_base_tests( + &p256::PRIVATE_KEY_OPS, + |s, cpu| p256::PRIVATE_KEY_OPS.point_mul(s, &generator, cpu), + test_vector_file!("ops/p256_point_mul_base_tests.txt"), + ); + } + + /// TODO: We should be testing `point_mul` with points other than the generator. + #[test] + fn p384_point_mul_test() { + let generator = ( + Elem::from(&p384::GENERATOR.0), + Elem::from(&p384::GENERATOR.1), + ); + + point_mul_base_tests( + &p384::PRIVATE_KEY_OPS, + |s, cpu| p384::PRIVATE_KEY_OPS.point_mul(s, &generator, cpu), + test_vector_file!("ops/p384_point_mul_base_tests.txt"), + ); + } + + #[test] + fn p256_point_mul_serialized_test() { + point_mul_serialized_test( + &p256::PRIVATE_KEY_OPS, + &p256::PUBLIC_KEY_OPS, + test_vector_file!("ops/p256_point_mul_serialized_tests.txt"), + ); + } + + fn point_mul_serialized_test( + priv_ops: &PrivateKeyOps, + pub_ops: &PublicKeyOps, + test_file: test::File, + ) { + let cpu = cpu::features(); + let cops = pub_ops.common; + let q = &cops.elem_modulus(cpu); + let n = &cops.scalar_modulus(cpu); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let p_scalar = consume_scalar(n, test_case, "p_scalar"); + + let p = test_case.consume_bytes("p"); + let p = super::super::public_key::parse_uncompressed_point( + pub_ops, + q, + untrusted::Input::from(&p), + ) + .expect("valid point"); + + let expected_result = test_case.consume_bytes("r"); + + let product = priv_ops.point_mul(&p_scalar, &p, cpu::features()); + + let mut actual_result = vec![4u8; 1 + (2 * cops.len())]; + { + let (x, y) = actual_result[1..].split_at_mut(cops.len()); + super::super::private_key::big_endian_affine_from_jacobian( + priv_ops, + q, + x, + Some(y), + &product, + ) + .expect("successful encoding"); + } + + assert_eq!(expected_result, actual_result); + + Ok(()) + }) + } + + #[test] + fn p256_point_mul_base_test() { + point_mul_base_tests( + &p256::PRIVATE_KEY_OPS, + |s, cpu| p256::PRIVATE_KEY_OPS.point_mul_base(s, cpu), + test_vector_file!("ops/p256_point_mul_base_tests.txt"), + ); + } + + #[test] + fn p384_point_mul_base_test() { + point_mul_base_tests( + 
&p384::PRIVATE_KEY_OPS, + |s, cpu| p384::PRIVATE_KEY_OPS.point_mul_base(s, cpu), + test_vector_file!("ops/p384_point_mul_base_tests.txt"), + ); + } + + pub(super) fn point_mul_base_tests( + ops: &PrivateKeyOps, + f: impl Fn(&Scalar, cpu::Features) -> Point, + test_file: test::File, + ) { + let cpu = cpu::features(); + let n = &ops.common.scalar_modulus(cpu); + test::run(test_file, |section, test_case| { + assert_eq!(section, ""); + let g_scalar = consume_scalar(n, test_case, "g_scalar"); + let expected_result: TestPoint = consume_point(ops, test_case, "r"); + let actual_result = f(&g_scalar, cpu); + assert_point_actual_equals_expected(ops, &actual_result, &expected_result); + Ok(()) + }) + } + + fn assert_point_actual_equals_expected( + ops: &PrivateKeyOps, + actual_point: &Point, + expected_point: &TestPoint, + ) where + Elem: Convert, + { + let cpu = cpu::features(); + + let cops = ops.common; + let q = &cops.elem_modulus(cpu); + let actual_x = &q.point_x(actual_point); + let actual_y = &q.point_y(actual_point); + let actual_z = &q.point_z(actual_point); + match expected_point { + TestPoint::Infinity => { + let zero = Elem::zero(); + assert_elems_are_equal(q, actual_z, &zero); + } + TestPoint::Affine(expected_x, expected_y) => { + let zz_inv = ops.elem_inverse_squared(q, actual_z); + let x_aff = q.elem_product(actual_x, &zz_inv); + let y_aff = { + let zzzz_inv = q.elem_squared(&zz_inv); + let zzz_inv = q.elem_product(actual_z, &zzzz_inv); + q.elem_product(actual_y, &zzz_inv) + }; + + let x_aff = x_aff.convert(q); + let y_aff = y_aff.convert(q); + + assert_elems_are_equal(q, &x_aff, expected_x); + assert_elems_are_equal(q, &y_aff, expected_y); + } + } + } + + fn consume_jacobian_point( + ops: &PrivateKeyOps, + test_case: &mut test::TestCase, + name: &str, + ) -> Point { + let q = &ops.common.elem_modulus(cpu::features()); + let input = test_case.consume_string(name); + let elems = input.split(", ").collect::>(); + assert_eq!(elems.len(), 3); + let mut p = Point::new_at_infinity(); + consume_point_elem(q, &mut p.xyz, &elems, 0); + consume_point_elem(q, &mut p.xyz, &elems, 1); + consume_point_elem(q, &mut p.xyz, &elems, 2); + p + } + + struct AffinePoint { + xy: [Limb; 2 * elem::NumLimbs::MAX], + } + + fn consume_affine_point( + ops: &PrivateKeyOps, + test_case: &mut test::TestCase, + name: &str, + ) -> AffinePoint { + let q = &ops.common.elem_modulus(cpu::features()); + let input = test_case.consume_string(name); + let elems = input.split(", ").collect::>(); + assert_eq!(elems.len(), 2); + let mut p = AffinePoint { + xy: [0; 2 * elem::NumLimbs::MAX], + }; + consume_point_elem(q, &mut p.xy, &elems, 0); + consume_point_elem(q, &mut p.xy, &elems, 1); + p + } + + fn consume_point_elem(q: &Modulus, limbs_out: &mut [Limb], elems: &[&str], i: usize) { + let num_limbs = q.num_limbs.into(); + let bytes = test::from_hex(elems[i]).unwrap(); + let bytes = untrusted::Input::from(&bytes); + let r: Elem = elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); + // XXX: “Transmute” this to `Elem` limbs. 
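+        // The element types differ only in their `PhantomData` markers, so the
+        // "transmute" is just a verbatim limb copy that the caller reinterprets
+        // under a different `Encoding` parameter; no arithmetic conversion
+        // happens here.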
+ limbs_out[(i * num_limbs)..((i + 1) * num_limbs)].copy_from_slice(&r.limbs[..num_limbs]); + } + + enum TestPoint { + Infinity, + Affine(Elem, Elem), + } + + fn consume_point( + ops: &PrivateKeyOps, + test_case: &mut test::TestCase, + name: &str, + ) -> TestPoint { + let q = &ops.common.elem_modulus(cpu::features()); + fn consume_point_elem(q: &Modulus, elems: &[&str], i: usize) -> Elem { + let bytes = test::from_hex(elems[i]).unwrap(); + let bytes = untrusted::Input::from(&bytes); + let unencoded: Elem = + elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); + // XXX: “Transmute” this to `Elem` limbs. + Elem { + limbs: unencoded.limbs, + m: PhantomData, + encoding: PhantomData, + } + } + + let input = test_case.consume_string(name); + if input == "inf" { + return TestPoint::Infinity; + } + let elems = input.split(", ").collect::>(); + assert_eq!(elems.len(), 2); + let x = consume_point_elem(q, &elems, 0); + let y = consume_point_elem(q, &elems, 1); + TestPoint::Affine(x, y) + } + + fn assert_elems_are_equal(q: &Modulus, a: &Elem, b: &Elem) { + assert_limbs_are_equal(q.cops, &a.limbs, &b.limbs) + } + + fn assert_limbs_are_equal( + ops: &CommonOps, + actual: &[Limb; elem::NumLimbs::MAX], + expected: &[Limb; elem::NumLimbs::MAX], + ) { + let num_limbs = ops.num_limbs.into(); + if actual[..num_limbs] != expected[..num_limbs] { + let mut actual_s = alloc::string::String::new(); + let mut expected_s = alloc::string::String::new(); + for j in 0..num_limbs { + let width = LIMB_BITS / 4; + let formatted = format!("{:0width$x}", actual[num_limbs - j - 1]); + actual_s.push_str(&formatted); + let formatted = format!("{:0width$x}", expected[num_limbs - j - 1]); + expected_s.push_str(&formatted); + } + panic!( + "Actual != Expected,\nActual = {}, Expected = {}", + actual_s, expected_s + ); + } + } + + fn consume_elem(q: &Modulus, test_case: &mut test::TestCase, name: &str) -> Elem { + let unpadded_bytes = test_case.consume_bytes(name); + let mut bytes = vec![0; q.bytes_len() - unpadded_bytes.len()]; + bytes.extend(&unpadded_bytes); + + let bytes = untrusted::Input::from(&bytes); + let r: Elem = elem_parse_big_endian_fixed_consttime(q, bytes).unwrap(); + // XXX: “Transmute” this to an `Elem`. + Elem { + limbs: r.limbs, + m: PhantomData, + encoding: PhantomData, + } + } + + fn consume_scalar(n: &Modulus, test_case: &mut test::TestCase, name: &str) -> Scalar { + let bytes = test_case.consume_bytes(name); + let bytes = untrusted::Input::from(&bytes); + scalar_parse_big_endian_variable(n, AllowZero::Yes, bytes).unwrap() + } + + fn consume_scalar_mont( + n: &Modulus, + test_case: &mut test::TestCase, + name: &str, + ) -> Scalar { + let bytes = test_case.consume_bytes(name); + let bytes = untrusted::Input::from(&bytes); + let s = scalar_parse_big_endian_variable(n, AllowZero::Yes, bytes).unwrap(); + // “Transmute” it to a `Scalar`. + Scalar { + limbs: s.limbs, + m: PhantomData, + encoding: PhantomData, + } + } +} + +mod elem; +pub mod p256; +pub mod p384; diff --git a/ring-0.17.14/src/ec/suite_b/ops/elem.rs b/ring-0.17.14/src/ec/suite_b/ops/elem.rs new file mode 100644 index 0000000000..6c308d959e --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ops/elem.rs @@ -0,0 +1,167 @@ +// Copyright 2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::ec::suite_b::ops::{ + p256::NUM_LIMBS as P256_NUM_LIMBS, p384::NUM_LIMBS as P384_NUM_LIMBS, +}; +use crate::{ + arithmetic::{ + limbs_from_hex, + montgomery::{Encoding, ProductEncoding, Unencoded}, + }, + limb::{LeakyLimb, Limb}, +}; +use core::marker::PhantomData; + +#[derive(Clone, Copy)] +pub(super) enum NumLimbs { + P256, + P384, +} + +impl NumLimbs { + pub(super) const MAX: usize = Self::P384.into(); + + pub(super) const fn into(self) -> usize { + match self { + NumLimbs::P256 => P256_NUM_LIMBS, + NumLimbs::P384 => P384_NUM_LIMBS, + } + } +} + +/// Elements of ℤ/mℤ for some modulus *m*. Elements are always fully reduced +/// with respect to *m*; i.e. the 0 <= x < m for every value x. +#[derive(Clone, Copy)] +pub struct Elem { + // XXX: pub + pub(super) limbs: [Limb; NumLimbs::MAX], + + /// The modulus *m* for the ring ℤ/mℤ for which this element is a value. + pub(super) m: PhantomData, + + /// The number of Montgomery factors that need to be canceled out from + /// `value` to get the actual value. + pub(super) encoding: PhantomData, +} + +pub struct PublicElem { + pub(super) limbs: [LeakyLimb; NumLimbs::MAX], + pub(super) m: PhantomData, + pub(super) encoding: PhantomData, +} + +impl From<&PublicElem> for Elem { + fn from(value: &PublicElem) -> Self { + Self { + limbs: core::array::from_fn(|i| Limb::from(value.limbs[i])), + m: value.m, + encoding: value.encoding, + } + } +} + +impl Elem { + // There's no need to convert `value` to the Montgomery domain since + // 0 * R**2 (mod m) == 0, so neither the modulus nor the encoding are needed + // as inputs for constructing a zero-valued element. 
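+    // (As a concrete illustration with toy numbers, not taken from the code:
+    // with m = 97 and R = 2**8 = 256, the Montgomery encoding of 5 is
+    // 5 * 256 mod 97 = 19, whereas the encoding of 0 is 0 * 256 mod 97 = 0,
+    // so a zero-valued element looks the same in every encoding.)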
+ pub fn zero() -> Self { + Self { + limbs: [0; NumLimbs::MAX], + m: PhantomData, + encoding: PhantomData, + } + } +} + +impl Elem { + pub fn one() -> Self { + let mut r = Self::zero(); + r.limbs[0] = 1; + r + } +} + +impl PublicElem { + pub const fn from_hex(hex: &str) -> Self { + Self { + limbs: limbs_from_hex(hex), + m: PhantomData, + encoding: PhantomData, + } + } +} + +#[inline] +pub fn mul_mont( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + a: &Elem, + b: &Elem, +) -> Elem::Output> +where + (EA, EB): ProductEncoding, +{ + binary_op(f, a, b) +} + +// let r = f(a, b); return r; +#[inline] +pub fn binary_op( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + a: &Elem, + b: &Elem, +) -> Elem { + let mut r = Elem::zero(); + unsafe { f(r.limbs.as_mut_ptr(), a.limbs.as_ptr(), b.limbs.as_ptr()) } + r +} + +// a := f(a, b); +#[inline] +pub fn binary_op_assign( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + a: &mut Elem, + b: &Elem, +) { + unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr(), b.limbs.as_ptr()) } +} + +// let r = f(a); return r; +#[inline] +pub fn unary_op( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), + a: &Elem, +) -> Elem { + let mut r = Elem::zero(); + unsafe { f(r.limbs.as_mut_ptr(), a.limbs.as_ptr()) } + r +} + +// a := f(a); +#[inline] +pub fn unary_op_assign( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb), + a: &mut Elem, +) { + unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr()) } +} + +// a := f(a, a); +#[inline] +pub fn unary_op_from_binary_op_assign( + f: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb), + a: &mut Elem, +) { + unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr(), a.limbs.as_ptr()) } +} diff --git a/ring-0.17.14/src/ec/suite_b/ops/p256.rs b/ring-0.17.14/src/ec/suite_b/ops/p256.rs new file mode 100644 index 0000000000..6f3cd02398 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ops/p256.rs @@ -0,0 +1,334 @@ +// Copyright 2016-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{ + elem::{binary_op, binary_op_assign}, + elem_sqr_mul, elem_sqr_mul_acc, PublicModulus, *, +}; + +pub(super) const NUM_LIMBS: usize = 256 / LIMB_BITS; + +pub static COMMON_OPS: CommonOps = CommonOps { + num_limbs: elem::NumLimbs::P256, + + q: PublicModulus { + p: limbs_from_hex("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff"), + rr: PublicElem::from_hex("4fffffffdfffffffffffffffefffffffbffffffff0000000000000003"), + }, + n: PublicElem::from_hex("ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551"), + + a: PublicElem::from_hex("fffffffc00000004000000000000000000000003fffffffffffffffffffffffc"), + b: PublicElem::from_hex("dc30061d04874834e5a220abf7212ed6acf005cd78843090d89cdf6229c4bddf"), + + elem_mul_mont: p256_mul_mont, + elem_sqr_mont: p256_sqr_mont, +}; + +#[cfg(test)] +pub(super) static GENERATOR: (PublicElem, PublicElem) = ( + PublicElem::from_hex("18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c"), + PublicElem::from_hex("8571ff1825885d85d2e88688dd21f3258b4ab8e4ba19e45cddf25357ce95560a"), +); + +pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps { + common: &COMMON_OPS, + elem_inv_squared: p256_elem_inv_squared, + point_mul_base_impl: p256_point_mul_base_impl, + point_mul_impl: p256_point_mul, + point_add_jacobian_impl: p256_point_add, +}; + +fn p256_elem_inv_squared(q: &Modulus, a: &Elem) -> Elem { + // Calculate a**-2 (mod q) == a**(q - 3) (mod q) + // + // The exponent (q - 3) is: + // + // 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc + + #[inline] + fn sqr_mul(q: &Modulus, a: &Elem, squarings: LeakyWord, b: &Elem) -> Elem { + elem_sqr_mul(&COMMON_OPS, a, squarings, b, q.cpu()) + } + + #[inline] + fn sqr_mul_acc(q: &Modulus, a: &mut Elem, squarings: LeakyWord, b: &Elem) { + elem_sqr_mul_acc(&COMMON_OPS, a, squarings, b, q.cpu()) + } + + let b_1 = &a; + let b_11 = sqr_mul(q, b_1, 1, b_1); + let b_111 = sqr_mul(q, &b_11, 1, b_1); + let f_11 = sqr_mul(q, &b_111, 3, &b_111); + let fff = sqr_mul(q, &f_11, 6, &f_11); + let fff_111 = sqr_mul(q, &fff, 3, &b_111); + let fffffff_11 = sqr_mul(q, &fff_111, 15, &fff_111); + let ffffffff = sqr_mul(q, &fffffff_11, 2, &b_11); + + // ffffffff00000001 + let mut acc = sqr_mul(q, &ffffffff, 31 + 1, b_1); + + // ffffffff00000001000000000000000000000000ffffffff + sqr_mul_acc(q, &mut acc, 96 + 32, &ffffffff); + + // ffffffff00000001000000000000000000000000ffffffffffffffff + sqr_mul_acc(q, &mut acc, 32, &ffffffff); + + // ffffffff00000001000000000000000000000000fffffffffffffffffffffff_11 + sqr_mul_acc(q, &mut acc, 30, &fffffff_11); + + // ffffffff00000001000000000000000000000000fffffffffffffffffffffffc + q.elem_square(&mut acc); + q.elem_square(&mut acc); + + acc +} + +fn p256_point_mul_base_impl(g_scalar: &Scalar, _cpu: cpu::Features) -> Point { + prefixed_extern! 
{ + fn p256_point_mul_base( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + g_scalar: *const Limb, // [COMMON_OPS.num_limbs] + ); + } + + let mut r = Point::new_at_infinity(); + unsafe { + p256_point_mul_base(r.xyz.as_mut_ptr(), g_scalar.limbs.as_ptr()); + } + r +} + +pub static PUBLIC_KEY_OPS: PublicKeyOps = PublicKeyOps { + common: &COMMON_OPS, +}; + +pub static SCALAR_OPS: ScalarOps = ScalarOps { + common: &COMMON_OPS, + scalar_mul_mont: p256_scalar_mul_mont, +}; + +pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps { + scalar_ops: &SCALAR_OPS, + public_key_ops: &PUBLIC_KEY_OPS, + + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + twin_mul: twin_mul_nistz256, + + #[cfg(not(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + )))] + twin_mul: |g_scalar, p_scalar, p_xy, cpu| { + twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy, cpu) + }, + + q_minus_n: PublicElem::from_hex("4319055358e8617b0c46353d039cdaae"), + + // TODO: Use an optimized variable-time implementation. + scalar_inv_to_mont_vartime: |s, cpu| PRIVATE_SCALAR_OPS.scalar_inv_to_mont(s, cpu), +}; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +fn twin_mul_nistz256( + g_scalar: &Scalar, + p_scalar: &Scalar, + p_xy: &(Elem, Elem), + cpu: cpu::Features, +) -> Point { + let scaled_g = point_mul_base_vartime(g_scalar, cpu); + let scaled_p = PRIVATE_KEY_OPS.point_mul(p_scalar, p_xy, cpu::features()); + PRIVATE_KEY_OPS.point_sum(&scaled_g, &scaled_p, cpu) +} + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] +fn point_mul_base_vartime(g_scalar: &Scalar, _cpu: cpu::Features) -> Point { + prefixed_extern! { + fn p256_point_mul_base_vartime(r: *mut Limb, // [3][COMMON_OPS.num_limbs] + g_scalar: *const Limb, // [COMMON_OPS.num_limbs] + ); + } + let mut scaled_g = Point::new_at_infinity(); + unsafe { + p256_point_mul_base_vartime(scaled_g.xyz.as_mut_ptr(), g_scalar.limbs.as_ptr()); + } + scaled_g +} + +pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps { + scalar_ops: &SCALAR_OPS, + + oneRR_mod_n: PublicScalar::from_hex( + "66e12d94f3d956202845b2392b6bec594699799c49bd6fa683244c95be79eea2", + ), + scalar_inv_to_mont: p256_scalar_inv_to_mont, +}; + +#[allow(clippy::just_underscores_and_digits)] +fn p256_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { + // Calculate the modular inverse of scalar |a| using Fermat's Little + // Theorem: + // + // a**-1 (mod n) == a**(n - 2) (mod n) + // + // The exponent (n - 2) is: + // + // 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f + + #[inline] + fn mul(a: &Scalar, b: &Scalar) -> Scalar { + binary_op(p256_scalar_mul_mont, a, b) + } + + #[inline] + fn sqr(a: &Scalar) -> Scalar { + let mut tmp = Scalar::zero(); + unsafe { p256_scalar_sqr_rep_mont(tmp.limbs.as_mut_ptr(), a.limbs.as_ptr(), 1) } + tmp + } + + // Returns (`a` squared `squarings` times) * `b`. + fn sqr_mul(a: &Scalar, squarings: LeakyWord, b: &Scalar) -> Scalar { + debug_assert!(squarings >= 1); + let mut tmp = Scalar::zero(); + unsafe { p256_scalar_sqr_rep_mont(tmp.limbs.as_mut_ptr(), a.limbs.as_ptr(), squarings) } + mul(&tmp, b) + } + + // Sets `acc` = (`acc` squared `squarings` times) * `b`. 
+ fn sqr_mul_acc(acc: &mut Scalar, squarings: LeakyWord, b: &Scalar) { + debug_assert!(squarings >= 1); + unsafe { p256_scalar_sqr_rep_mont(acc.limbs.as_mut_ptr(), acc.limbs.as_ptr(), squarings) } + binary_op_assign(p256_scalar_mul_mont, acc, b); + } + + let _1 = &a; + + let _10 = sqr(_1); // 2 + let _100 = sqr(&_10); // 4 + let _101 = mul(&_100, _1); // 5 + let _111 = mul(&_101, &_10); // 7 + + let _1000 = sqr(&_100); // 8 + let _10000 = sqr(&_1000); // 16 + let _100000 = sqr(&_10000); // 32 + + let _100111 = mul(&_111, &_100000); // 39 = 7 + 32 + let _101011 = mul(&_100, &_100111); // 43 = 4 + 39 + let _101111 = mul(&_100, &_101011); // 47 = 4 + 39 + let _1001111 = mul(&_100000, &_101111); // 79 = 32 + 47 + let _86 = sqr(&_101011); // 86 = 43 * 2 + let _1011011 = mul(&_101, &_86); // 91 = 5 + 86 + let _92 = mul(_1, &_1011011); // 92 = 1 + 91 + let _1100011 = mul(&_111, &_92); // 99 = 7 + 92 + let _10111111 = mul(&_92, &_1100011); // 191 = 92 + 99 + let _11011111 = mul(&_100000, &_10111111); // 223 = 32 + 191 + + let ff = mul(&_100000, &_11011111); // 255 = 32 + 223 + let ffff = sqr_mul(&ff, 0 + 8, &ff); + let ffffffff = sqr_mul(&ffff, 0 + 16, &ffff); + + // ffffffff00000000ffffffff + let mut acc = sqr_mul(&ffffffff, 32 + 32, &ffffffff); + + // ffffffff00000000ffffffffffffffff + sqr_mul_acc(&mut acc, 0 + 32, &ffffffff); + + // The rest of the exponent, in binary, is: + // + // 1011110011100110111110101010110110100111000101111001111010000100 + // 1111001110111001110010101100001011111100011000110010010101001111 + + sqr_mul_acc(&mut acc, 6, &_101111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 8, &_11011111); + sqr_mul_acc(&mut acc, 1 + 3, &_101); + sqr_mul_acc(&mut acc, 1 + 7, &_1011011); + sqr_mul_acc(&mut acc, 1 + 6, &_100111); + sqr_mul_acc(&mut acc, 3 + 6, &_101111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 3, &_101); + sqr_mul_acc(&mut acc, 4 + 7, &_1001111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 1 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 3, &_111); + sqr_mul_acc(&mut acc, 2 + 6, &_101011); + sqr_mul_acc(&mut acc, 4 + 8, &_10111111); + sqr_mul_acc(&mut acc, 3 + 7, &_1100011); + sqr_mul_acc(&mut acc, 2 + 1, _1); + sqr_mul_acc(&mut acc, 2 + 3, &_101); + sqr_mul_acc(&mut acc, 1 + 7, &_1001111); + + acc +} + +prefixed_extern! 
{ + pub(super) fn p256_mul_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + b: *const Limb, // [COMMON_OPS.num_limbs] + ); + pub(super) fn p256_sqr_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + ); + + fn p256_point_add( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + a: *const Limb, // [3][COMMON_OPS.num_limbs] + b: *const Limb, // [3][COMMON_OPS.num_limbs] + ); + fn p256_point_mul( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + p_scalar: *const Limb, // [COMMON_OPS.num_limbs] + p_x: *const Limb, // [COMMON_OPS.num_limbs] + p_y: *const Limb, // [COMMON_OPS.num_limbs] + ); + + fn p256_scalar_mul_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + b: *const Limb, // [COMMON_OPS.num_limbs] + ); + fn p256_scalar_sqr_rep_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + rep: LeakyWord, + ); +} + +#[cfg(test)] +mod tests { + #[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" + ))] + #[test] + fn p256_point_mul_base_vartime_test() { + use super::{super::tests::point_mul_base_tests, *}; + point_mul_base_tests( + &PRIVATE_KEY_OPS, + point_mul_base_vartime, + test_vector_file!("p256_point_mul_base_tests.txt"), + ); + } +} diff --git a/ring-0.17.14/src/ec/suite_b/ops/p384.rs b/ring-0.17.14/src/ec/suite_b/ops/p384.rs new file mode 100644 index 0000000000..f1bde6d848 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/ops/p384.rs @@ -0,0 +1,304 @@ +// Copyright 2016-2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{ + elem::{binary_op, binary_op_assign}, + elem_sqr_mul, elem_sqr_mul_acc, PublicModulus, *, +}; + +pub(super) const NUM_LIMBS: usize = 384 / LIMB_BITS; + +pub static COMMON_OPS: CommonOps = CommonOps { + num_limbs: elem::NumLimbs::P384, + + q: PublicModulus { + p: limbs_from_hex("fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff"), + rr: PublicElem::from_hex("10000000200000000fffffffe000000000000000200000000fffffffe00000001"), + }, + n: PublicElem::from_hex("ffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973"), + + a: PublicElem::from_hex("fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffffffc0000000000000003fffffffc"), + b: PublicElem::from_hex("cd08114b604fbff9b62b21f41f022094e3374bee94938ae277f2209b1920022ef729add87a4c32ec081188719d412dcc"), + + elem_mul_mont: p384_elem_mul_mont, + elem_sqr_mont: p384_elem_sqr_mont, +}; + +pub(super) static GENERATOR: (PublicElem, PublicElem) = ( + PublicElem::from_hex("4d3aadc2299e1513812ff723614ede2b6454868459a30eff879c3afc541b4d6e20e378e2a0d6ce383dd0756649c0b528"), + PublicElem::from_hex("2b78abc25a15c5e9dd8002263969a840c6c3521968f4ffd98bade7562e83b050a1bfa8bf7bb4a9ac23043dad4b03a4fe"), +); + +pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps { + common: &COMMON_OPS, + elem_inv_squared: p384_elem_inv_squared, + point_mul_base_impl: p384_point_mul_base_impl, + point_mul_impl: p384_point_mul, + point_add_jacobian_impl: p384_point_add, +}; + +fn p384_elem_inv_squared(q: &Modulus, a: &Elem) -> Elem { + // Calculate a**-2 (mod q) == a**(q - 3) (mod q) + // + // The exponent (q - 3) is: + // + // 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe\ + // ffffffff0000000000000000fffffffc + + #[inline] + fn sqr_mul(q: &Modulus, a: &Elem, squarings: LeakyWord, b: &Elem) -> Elem { + elem_sqr_mul(&COMMON_OPS, a, squarings, b, q.cpu()) + } + + #[inline] + fn sqr_mul_acc(q: &Modulus, a: &mut Elem, squarings: LeakyWord, b: &Elem) { + elem_sqr_mul_acc(&COMMON_OPS, a, squarings, b, q.cpu()) + } + + let b_1 = &a; + let b_11 = sqr_mul(q, b_1, 1, b_1); + let b_111 = sqr_mul(q, &b_11, 1, b_1); + let f_11 = sqr_mul(q, &b_111, 3, &b_111); + let fff = sqr_mul(q, &f_11, 6, &f_11); + let fff_111 = sqr_mul(q, &fff, 3, &b_111); + let fffffff_11 = sqr_mul(q, &fff_111, 15, &fff_111); + + let fffffffffffffff = sqr_mul(q, &fffffff_11, 30, &fffffff_11); + + let ffffffffffffffffffffffffffffff = sqr_mul(q, &fffffffffffffff, 60, &fffffffffffffff); + + // ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + let mut acc = sqr_mul( + q, + &ffffffffffffffffffffffffffffff, + 120, + &ffffffffffffffffffffffffffffff, + ); + + // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff_111 + sqr_mul_acc(q, &mut acc, 15, &fff_111); + + // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff + sqr_mul_acc(q, &mut acc, 1 + 30, &fffffff_11); + sqr_mul_acc(q, &mut acc, 2, &b_11); + + // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff + // 0000000000000000fffffff_11 + sqr_mul_acc(q, &mut acc, 64 + 30, &fffffff_11); + + // fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff + // 0000000000000000fffffffc + q.elem_square(&mut acc); + q.elem_square(&mut acc); + + acc +} + +fn p384_point_mul_base_impl(a: &Scalar, cpu: cpu::Features) -> Point { + // XXX: Not efficient. TODO: Precompute multiples of the generator. 
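+    // Unlike P-256, which calls a dedicated `p256_point_mul_base` routine,
+    // P-384 falls back to the generic variable-point multiplication applied
+    // to the generator's coordinates, as the two lines below show.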
+ let generator = (Elem::from(&GENERATOR.0), Elem::from(&GENERATOR.1)); + PRIVATE_KEY_OPS.point_mul(a, &generator, cpu) +} + +pub static PUBLIC_KEY_OPS: PublicKeyOps = PublicKeyOps { + common: &COMMON_OPS, +}; + +pub static SCALAR_OPS: ScalarOps = ScalarOps { + common: &COMMON_OPS, + scalar_mul_mont: p384_scalar_mul_mont, +}; + +pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps { + scalar_ops: &SCALAR_OPS, + public_key_ops: &PUBLIC_KEY_OPS, + twin_mul: |g_scalar, p_scalar, p_xy, cpu| { + twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy, cpu) + }, + + q_minus_n: PublicElem::from_hex("389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68c"), + + // TODO: Use an optimized variable-time implementation. + scalar_inv_to_mont_vartime: |s, cpu| PRIVATE_SCALAR_OPS.scalar_inv_to_mont(s, cpu), +}; + +pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps { + scalar_ops: &SCALAR_OPS, + + oneRR_mod_n: PublicScalar::from_hex("c84ee012b39bf213fb05b7a28266895d40d49174aab1cc5bc3e483afcb82947ff3d81e5df1aa4192d319b2419b409a9"), + scalar_inv_to_mont: p384_scalar_inv_to_mont, +}; + +fn p384_scalar_inv_to_mont(a: Scalar, _cpu: cpu::Features) -> Scalar { + // Calculate the modular inverse of scalar |a| using Fermat's Little + // Theorem: + // + // a**-1 (mod n) == a**(n - 2) (mod n) + // + // The exponent (n - 2) is: + // + // 0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf\ + // 581a0db248b0a77aecec196accc52971 + + fn mul(a: &Scalar, b: &Scalar) -> Scalar { + binary_op(p384_scalar_mul_mont, a, b) + } + + fn sqr(a: &Scalar) -> Scalar { + binary_op(p384_scalar_mul_mont, a, a) + } + + fn sqr_mut(a: &mut Scalar) { + unary_op_from_binary_op_assign(p384_scalar_mul_mont, a); + } + + // Returns (`a` squared `squarings` times) * `b`. + fn sqr_mul(a: &Scalar, squarings: LeakyWord, b: &Scalar) -> Scalar { + debug_assert!(squarings >= 1); + let mut tmp = sqr(a); + for _ in 1..squarings { + sqr_mut(&mut tmp); + } + mul(&tmp, b) + } + + // Sets `acc` = (`acc` squared `squarings` times) * `b`. + fn sqr_mul_acc(acc: &mut Scalar, squarings: LeakyWord, b: &Scalar) { + debug_assert!(squarings >= 1); + for _ in 0..squarings { + sqr_mut(acc); + } + binary_op_assign(p384_scalar_mul_mont, acc, b) + } + + // Indexes into `d`. 
+ const B_1: usize = 0; + const B_11: usize = 1; + const B_101: usize = 2; + const B_111: usize = 3; + const B_1001: usize = 4; + const B_1011: usize = 5; + const B_1101: usize = 6; + const B_1111: usize = 7; + const DIGIT_COUNT: usize = 8; + + let mut d = [Scalar::zero(); DIGIT_COUNT]; + d[B_1] = a; + let b_10 = sqr(&d[B_1]); + for i in B_11..DIGIT_COUNT { + d[i] = mul(&d[i - 1], &b_10); + } + + let ff = sqr_mul(&d[B_1111], 0 + 4, &d[B_1111]); + let ffff = sqr_mul(&ff, 0 + 8, &ff); + let ffffffff = sqr_mul(&ffff, 0 + 16, &ffff); + + let ffffffffffffffff = sqr_mul(&ffffffff, 0 + 32, &ffffffff); + + let ffffffffffffffffffffffff = sqr_mul(&ffffffffffffffff, 0 + 32, &ffffffff); + + // ffffffffffffffffffffffffffffffffffffffffffffffff + let mut acc = sqr_mul(&ffffffffffffffffffffffff, 0 + 96, &ffffffffffffffffffffffff); + + // The rest of the exponent, in binary, is: + // + // 1100011101100011010011011000000111110100001101110010110111011111 + // 0101100000011010000011011011001001001000101100001010011101111010 + // 1110110011101100000110010110101011001100110001010010100101110001 + + #[allow(clippy::cast_possible_truncation)] + static REMAINING_WINDOWS: [(u8, u8); 39] = [ + (2, B_11 as u8), + (3 + 3, B_111 as u8), + (1 + 2, B_11 as u8), + (3 + 2, B_11 as u8), + (1 + 4, B_1001 as u8), + (4, B_1011 as u8), + (6 + 4, B_1111 as u8), + (3, B_101 as u8), + (4 + 1, B_1 as u8), + (4, B_1011 as u8), + (4, B_1001 as u8), + (1 + 4, B_1101 as u8), + (4, B_1101 as u8), + (4, B_1111 as u8), + (1 + 4, B_1011 as u8), + (6 + 4, B_1101 as u8), + (5 + 4, B_1101 as u8), + (4, B_1011 as u8), + (2 + 4, B_1001 as u8), + (2 + 1, B_1 as u8), + (3 + 4, B_1011 as u8), + (4 + 3, B_101 as u8), + (2 + 3, B_111 as u8), + (1 + 4, B_1111 as u8), + (1 + 4, B_1011 as u8), + (4, B_1011 as u8), + (2 + 3, B_111 as u8), + (1 + 2, B_11 as u8), + (5 + 2, B_11 as u8), + (2 + 4, B_1011 as u8), + (1 + 3, B_101 as u8), + (1 + 2, B_11 as u8), + (2 + 2, B_11 as u8), + (2 + 2, B_11 as u8), + (3 + 3, B_101 as u8), + (2 + 3, B_101 as u8), + (2 + 3, B_101 as u8), + (2, B_11 as u8), + (3 + 1, B_1 as u8), + ]; + + for &(squarings, digit) in &REMAINING_WINDOWS[..] { + sqr_mul_acc(&mut acc, LeakyWord::from(squarings), &d[usize::from(digit)]); + } + + acc +} + +unsafe extern "C" fn p384_elem_sqr_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] +) { + // XXX: Inefficient. TODO: Make a dedicated squaring routine. + unsafe { + p384_elem_mul_mont(r, a, a); + } +} + +prefixed_extern! { + fn p384_elem_mul_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + b: *const Limb, // [COMMON_OPS.num_limbs] + ); + + fn p384_point_add( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + a: *const Limb, // [3][COMMON_OPS.num_limbs] + b: *const Limb, // [3][COMMON_OPS.num_limbs] + ); + fn p384_point_mul( + r: *mut Limb, // [3][COMMON_OPS.num_limbs] + p_scalar: *const Limb, // [COMMON_OPS.num_limbs] + p_x: *const Limb, // [COMMON_OPS.num_limbs] + p_y: *const Limb, // [COMMON_OPS.num_limbs] + ); + + fn p384_scalar_mul_mont( + r: *mut Limb, // [COMMON_OPS.num_limbs] + a: *const Limb, // [COMMON_OPS.num_limbs] + b: *const Limb, // [COMMON_OPS.num_limbs] + ); +} diff --git a/ring-0.17.14/src/ec/suite_b/private_key.rs b/ring-0.17.14/src/ec/suite_b/private_key.rs new file mode 100644 index 0000000000..19129296c3 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/private_key.rs @@ -0,0 +1,203 @@ +// Copyright 2016 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Functionality shared by operations on private keys (ECC keygen and +//! ECDSA signing). + +use super::{ops::*, verify_affine_point_is_on_the_curve}; +use crate::{arithmetic::montgomery::R, cpu, ec, error, limb, rand}; + +/// Generates a random scalar in the range [1, n). +pub(super) fn random_scalar( + ops: &PrivateKeyOps, + n: &Modulus, + rng: &dyn rand::SecureRandom, +) -> Result { + let mut bytes = [0; ec::SCALAR_MAX_BYTES]; + let bytes = &mut bytes[..ops.common.len()]; + generate_private_scalar_bytes(ops, rng, bytes, n.cpu())?; + scalar_from_big_endian_bytes(n, bytes) +} + +pub(super) fn generate_private_scalar_bytes( + ops: &PrivateKeyOps, + rng: &dyn rand::SecureRandom, + out: &mut [u8], + cpu: cpu::Features, +) -> Result<(), error::Unspecified> { + // [NSA Suite B Implementer's Guide to ECDSA] Appendix A.1.2, and + // [NSA Suite B Implementer's Guide to NIST SP 800-56A] Appendix B.2, + // "Key Pair Generation by Testing Candidates". + // + // [NSA Suite B Implementer's Guide to ECDSA]: doc/ecdsa.pdf + // [NSA Suite B Implementer's Guide to NIST SP 800-56A]: doc/ecdh.pdf + + // TODO: The NSA guide also suggests, in appendix B.1, another mechanism + // that would avoid the need to use `rng.fill()` more than once. It works + // by generating an extra 64 bits of random bytes and then reducing the + // output (mod n). Supposedly, this removes enough of the bias towards + // small values from the modular reduction, but it isn't obvious that it is + // sufficient. TODO: Figure out what we can do to mitigate the bias issue + // and switch to the other mechanism. + + let candidate = out; + + // XXX: The value 100 was chosen to match OpenSSL due to uncertainty of + // what specific value would be better, but it seems bad to try 100 times. + for _ in 0..100 { + // NSA Guide Steps 1, 2, and 3. + // + // Since we calculate the length ourselves, it is pointless to check + // it, since we can only check it by doing the same calculation. + + // NSA Guide Step 4. + // + // The requirement that the random number generator has the + // requested security strength is delegated to `rng`. + rng.fill(candidate)?; + + // NSA Guide Steps 5, 6, and 7. + if check_scalar_big_endian_bytes(ops, candidate, cpu).is_err() { + continue; + } + + // NSA Guide Step 8 is done in `public_from_private()`. + + // NSA Guide Step 9. + return Ok(()); + } + + Err(error::Unspecified) +} + +// The underlying X25519 and Ed25519 code uses an [u8; 32] to store the private +// key. To make the ECDH and ECDSA code similar to that, we also store the +// private key that way, which means we have to convert it to a Scalar whenever +// we need to use it. 
+#[inline] +pub(super) fn private_key_as_scalar(n: &Modulus, private_key: &ec::Seed) -> Scalar { + // This cannot fail because we know the private key is valid. + scalar_from_big_endian_bytes(n, private_key.bytes_less_safe()).unwrap() +} + +pub(super) fn check_scalar_big_endian_bytes( + ops: &PrivateKeyOps, + bytes: &[u8], + cpu: cpu::Features, +) -> Result<(), error::Unspecified> { + debug_assert_eq!(bytes.len(), ops.common.len()); + let n = &ops.common.scalar_modulus(cpu); + scalar_from_big_endian_bytes(n, bytes).map(|_| ()) +} + +// Parses a fixed-length (zero-padded) big-endian-encoded scalar in the range +// [1, n). This is intended to be constant-time with respect to the actual +// value *only if* the value is actually in range. In other words, this won't +// leak anything about a valid value, but it might leak small amounts of +// information about an invalid value (which constraint it failed). +pub(super) fn scalar_from_big_endian_bytes( + n: &Modulus, + bytes: &[u8], +) -> Result { + // [NSA Suite B Implementer's Guide to ECDSA] Appendix A.1.2, and + // [NSA Suite B Implementer's Guide to NIST SP 800-56A] Appendix B.2, + // "Key Pair Generation by Testing Candidates". + // + // [NSA Suite B Implementer's Guide to ECDSA]: doc/ecdsa.pdf + // [NSA Suite B Implementer's Guide to NIST SP 800-56A]: doc/ecdh.pdf + // + // Steps 5, 6, and 7. + // + // XXX: The NSA guide says that we should verify that the random scalar is + // in the range [0, n - 1) and then add one to it so that it is in the range + // [1, n). Instead, we verify that the scalar is in the range [1, n). This + // way, we avoid needing to compute or store the value (n - 1), we avoid the + // need to implement a function to add one to a scalar, and we avoid needing + // to convert the scalar back into an array of bytes. + scalar_parse_big_endian_fixed_consttime(n, untrusted::Input::from(bytes)) +} + +pub(super) fn public_from_private( + ops: &PrivateKeyOps, + public_out: &mut [u8], + my_private_key: &ec::Seed, + cpu: cpu::Features, +) -> Result<(), error::Unspecified> { + let q = &ops.common.elem_modulus(cpu); + let elem_and_scalar_bytes = ops.common.len(); + debug_assert_eq!(public_out.len(), 1 + (2 * elem_and_scalar_bytes)); + let n = &ops.common.scalar_modulus(cpu); + let my_private_key = private_key_as_scalar(n, my_private_key); + let my_public_key = ops.point_mul_base(&my_private_key, cpu); + public_out[0] = 4; // Uncompressed encoding. + let (x_out, y_out) = public_out[1..].split_at_mut(elem_and_scalar_bytes); + + // `big_endian_affine_from_jacobian` verifies that the point is not at + // infinity and is on the curve. + big_endian_affine_from_jacobian(ops, q, x_out, Some(y_out), &my_public_key) +} + +pub(super) fn affine_from_jacobian( + ops: &PrivateKeyOps, + q: &Modulus, + p: &Point, +) -> Result<(Elem, Elem), error::Unspecified> { + let z = q.point_z(p); + + // Since we restrict our private key to the range [1, n), the curve has + // prime order, and we verify that the peer's point is on the curve, + // there's no way that the result can be at infinity. But, use `assert!` + // instead of `debug_assert!` anyway + assert!(q.elem_verify_is_not_zero(&z).is_ok()); + + let x = q.point_x(p); + let y = q.point_y(p); + + let zz_inv = ops.elem_inverse_squared(q, &z); + + let x_aff = q.elem_product(&x, &zz_inv); + + // `y_aff` is needed to validate the point is on the curve. It is also + // needed in the non-ECDH case where we need to output it. 
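+    // For a Jacobian point (X, Y, Z) the affine coordinates are
+    // (X / Z**2, Y / Z**3). `zz_inv` is Z**-2, so `x_aff` above is X * Z**-2;
+    // squaring `zz_inv` gives Z**-4, and multiplying that by Z recovers the
+    // Z**-3 needed for `y_aff` below.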
+ let y_aff = { + let zzzz_inv = q.elem_squared(&zz_inv); + let zzz_inv = q.elem_product(&z, &zzzz_inv); + q.elem_product(&y, &zzz_inv) + }; + + // If we validated our inputs correctly and then computed (x, y, z), then + // (x, y, z) will be on the curve. See + // `verify_affine_point_is_on_the_curve_scaled` for the motivation. + verify_affine_point_is_on_the_curve(q, (&x_aff, &y_aff))?; + + Ok((x_aff, y_aff)) +} + +pub(super) fn big_endian_affine_from_jacobian( + ops: &PrivateKeyOps, + q: &Modulus, + x_out: &mut [u8], + y_out: Option<&mut [u8]>, + p: &Point, +) -> Result<(), error::Unspecified> { + let (x_aff, y_aff) = affine_from_jacobian(ops, q, p)?; + let x = q.elem_unencoded(&x_aff); + limb::big_endian_from_limbs(ops.leak_limbs(&x), x_out); + if let Some(y_out) = y_out { + let y = q.elem_unencoded(&y_aff); + limb::big_endian_from_limbs(ops.leak_limbs(&y), y_out); + } + + Ok(()) +} diff --git a/ring-0.17.14/src/ec/suite_b/public_key.rs b/ring-0.17.14/src/ec/suite_b/public_key.rs new file mode 100644 index 0000000000..328bb371a4 --- /dev/null +++ b/ring-0.17.14/src/ec/suite_b/public_key.rs @@ -0,0 +1,111 @@ +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Functionality shared by operations on public keys (ECDSA verification and +//! ECDH agreement). + +use super::{ops::*, verify_affine_point_is_on_the_curve}; +use crate::{arithmetic::montgomery::*, error}; + +/// Parses a public key encoded in uncompressed form. The key is validated +/// using the ECC Partial Public-Key Validation Routine from +/// [NIST SP 800-56A, revision 2] Section 5.6.2.3.3, the NSA's +/// "Suite B Implementer's Guide to NIST SP 800-56A," Appendix B.3, and the +/// NSA's "Suite B Implementer's Guide to FIPS 186-3 (ECDSA)," Appendix A.3. +/// +/// [NIST SP 800-56A, revision 2]: +/// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf +pub(super) fn parse_uncompressed_point( + ops: &PublicKeyOps, + q: &Modulus, + input: untrusted::Input, +) -> Result<(Elem, Elem), error::Unspecified> { + // NIST SP 800-56A Step 1: "Verify that Q is not the point at infinity. + // This can be done by inspection if the point is entered in the standard + // affine representation." (We do it by inspection since we only accept + // the affine representation.) + let (x, y) = input.read_all(error::Unspecified, |input| { + // The encoding must be 4, which is the encoding for "uncompressed". 
+ let encoding = input.read_byte()?; + if encoding != 4 { + return Err(error::Unspecified); + } + + // NIST SP 800-56A Step 2: "Verify that xQ and yQ are integers in the + // interval [0, p-1] in the case that q is an odd prime p[.]" + let x = ops.elem_parse(q, input)?; + let y = ops.elem_parse(q, input)?; + Ok((x, y)) + })?; + + // NIST SP 800-56A Step 3: "If q is an odd prime p, verify that + // yQ**2 = xQ**3 + axQ + b in GF(p), where the arithmetic is performed + // modulo p." + verify_affine_point_is_on_the_curve(q, (&x, &y))?; + + // NIST SP 800-56A Note: "Since its order is not verified, there is no + // check that the public key is in the correct EC subgroup." + // + // NSA Suite B Implementer's Guide Note: "ECC Full Public-Key Validation + // includes an additional check to ensure that the point has the correct + // order. This check is not necessary for curves having prime order (and + // cofactor h = 1), such as P-256 and P-384." + + Ok((x, y)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cpu; + use crate::testutil as test; + + #[test] + fn parse_uncompressed_point_test() { + let cpu = cpu::features(); + test::run( + test_vector_file!("suite_b_public_key_tests.txt"), + |section, test_case| { + assert_eq!(section, ""); + + let curve_name = test_case.consume_string("Curve"); + + let public_key = test_case.consume_bytes("Q"); + let public_key = untrusted::Input::from(&public_key); + let is_valid = test_case.consume_string("Result") == "P"; + + let curve_ops = public_key_ops_from_curve_name(&curve_name); + let q = &curve_ops.common.elem_modulus(cpu); + + let result = parse_uncompressed_point(curve_ops, q, public_key); + assert_eq!(is_valid, result.is_ok()); + + // TODO: Verify that we when we re-serialize the parsed (x, y), the + // output is equal to the input. + + Ok(()) + }, + ); + } + + fn public_key_ops_from_curve_name(curve_name: &str) -> &'static PublicKeyOps { + if curve_name == "P-256" { + &p256::PUBLIC_KEY_OPS + } else if curve_name == "P-384" { + &p384::PUBLIC_KEY_OPS + } else { + panic!("Unsupported curve: {}", curve_name); + } + } +} diff --git a/ring-0.17.14/src/error/input_too_long.rs b/ring-0.17.14/src/error/input_too_long.rs new file mode 100644 index 0000000000..6000077469 --- /dev/null +++ b/ring-0.17.14/src/error/input_too_long.rs @@ -0,0 +1,39 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +pub struct InputTooLongError { + /// Note that this might not actually be the (exact) length of the input, + /// and its units might be lost. For example, it could be any of the + /// following: + /// + /// * The length in bytes of the entire input. + /// * The length in bytes of some *part* of the input. + /// * A bit length. + /// * A length in terms of "blocks" or other grouping of input values. 
+ /// * Some intermediate quantity that was used when checking the input + /// length. + /// * Some arbitrary value. + #[allow(dead_code)] + imprecise_input_length: T, +} + +impl InputTooLongError { + #[cold] + #[inline(never)] + pub(crate) fn new(imprecise_input_length: T) -> Self { + Self { + imprecise_input_length, + } + } +} diff --git a/ring-0.17.14/src/error/into_unspecified.rs b/ring-0.17.14/src/error/into_unspecified.rs new file mode 100644 index 0000000000..dc0bd94ae5 --- /dev/null +++ b/ring-0.17.14/src/error/into_unspecified.rs @@ -0,0 +1,33 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::error::{KeyRejected, Unspecified}; + +impl From for Unspecified { + fn from(source: untrusted::EndOfInput) -> Self { + super::erase(source) + } +} + +impl From for Unspecified { + fn from(source: core::array::TryFromSliceError) -> Self { + super::erase(source) + } +} + +impl From for Unspecified { + fn from(source: KeyRejected) -> Self { + super::erase(source) + } +} diff --git a/ring-0.17.14/src/error/key_rejected.rs b/ring-0.17.14/src/error/key_rejected.rs new file mode 100644 index 0000000000..c12d973ebc --- /dev/null +++ b/ring-0.17.14/src/error/key_rejected.rs @@ -0,0 +1,111 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Error reporting. + +#[cfg(feature = "std")] +extern crate std; + +/// An error parsing or validating a key. +/// +/// The `Display` implementation will return a string that will help you better +/// understand why a key was rejected change which errors are reported in which +/// situations while minimizing the likelihood that any applications will be +/// broken. +/// +/// Here is an incomplete list of reasons a key may be unsupported: +/// +/// * Invalid or Inconsistent Components: A component of the key has an invalid +/// value, or the mathematical relationship between two (or more) components +/// required for a valid key does not hold. +/// +/// * The encoding of the key is invalid. Perhaps the key isn't in the correct +/// format; e.g. 
it may be Base64 ("PEM") encoded, in which case the Base64 +/// encoding needs to be undone first. +/// +/// * The encoding includes a versioning mechanism and that mechanism indicates +/// that the key is encoded in a version of the encoding that isn't supported. +/// This might happen for multi-prime RSA keys (keys with more than two +/// private prime factors), which aren't supported, for example. +/// +/// * Too small or too Large: One of the primary components of the key is too +/// small or two large. Too-small keys are rejected for security reasons. Some +/// unnecessarily large keys are rejected for performance reasons. +/// +/// * Wrong algorithm: The key is not valid for the algorithm in which it was +/// being used. +/// +/// * Unexpected errors: Report this as a bug. +#[derive(Copy, Clone, Debug)] +pub struct KeyRejected(&'static str); + +impl KeyRejected { + pub(crate) fn inconsistent_components() -> Self { + Self("InconsistentComponents") + } + + pub(crate) fn invalid_component() -> Self { + Self("InvalidComponent") + } + + #[inline] + pub(crate) fn invalid_encoding() -> Self { + Self("InvalidEncoding") + } + + // XXX: See the comment at the call site. + pub(crate) fn rng_failed() -> Self { + Self("RNG failed") + } + + pub(crate) fn public_key_is_missing() -> Self { + Self("PublicKeyIsMissing") + } + + #[cfg(feature = "alloc")] + pub(crate) fn too_small() -> Self { + Self("TooSmall") + } + + #[cfg(feature = "alloc")] + pub(crate) fn too_large() -> Self { + Self("TooLarge") + } + + pub(crate) fn version_not_supported() -> Self { + Self("VersionNotSupported") + } + + pub(crate) fn wrong_algorithm() -> Self { + Self("WrongAlgorithm") + } + + #[cfg(feature = "alloc")] + pub(crate) fn private_modulus_len_not_multiple_of_512_bits() -> Self { + Self("PrivateModulusLenNotMultipleOf512Bits") + } + + pub(crate) fn unexpected_error() -> Self { + Self("UnexpectedError") + } +} + +#[cfg(feature = "std")] +impl std::error::Error for KeyRejected {} + +impl core::fmt::Display for KeyRejected { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.write_str(self.0) + } +} diff --git a/ring-0.17.14/src/error/mod.rs b/ring-0.17.14/src/error/mod.rs new file mode 100644 index 0000000000..ead174fd45 --- /dev/null +++ b/ring-0.17.14/src/error/mod.rs @@ -0,0 +1,58 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Error reporting. 
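As an illustrative aside (not part of the vendored sources): since `KeyRejected` deliberately exposes nothing beyond its `Display` string, a caller typically just logs that string or folds it into its own error type. A small sketch against ring's public API (the PKCS#8 input is a placeholder):

```rust
use ring::{error::KeyRejected, signature::Ed25519KeyPair};

/// Parse a PKCS#8-encoded Ed25519 key, mapping the opaque rejection reason
/// ("InvalidEncoding", "WrongAlgorithm", ...) into a plain string.
fn load_key(pkcs8: &[u8]) -> Result<Ed25519KeyPair, String> {
    Ed25519KeyPair::from_pkcs8(pkcs8).map_err(|e: KeyRejected| e.to_string())
}
```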
+ +pub use self::{key_rejected::KeyRejected, unspecified::Unspecified}; + +pub(crate) use self::{ + input_too_long::InputTooLongError, len_mismatch_error::LenMismatchError, + too_much_output_requested::TooMuchOutputRequestedError, +}; + +mod input_too_long; +mod into_unspecified; +mod key_rejected; +mod unspecified; + +#[cold] +#[inline(never)] +pub(crate) fn erase(_: T) -> Unspecified { + Unspecified +} + +cold_exhaustive_error! { + struct too_much_output_requested::TooMuchOutputRequestedError + with pub(crate) constructor { + // Note that this might not actually be the (exact) output length + // requested, and its units might be lost. For example, it could be any of + // the following: + // + // * The length in bytes of the entire output. + // * The length in bytes of some *part* of the output. + // * A bit length. + // * A length in terms of "blocks" or other grouping of output values. + // * Some intermediate quantity that was used when checking the output + // length. + // * Some arbitrary value. + imprecise_output_length: usize + } +} + +cold_exhaustive_error! { + struct len_mismatch_error::LenMismatchError + with pub(crate) constructor { + len: usize + } +} diff --git a/ring-0.17.14/src/error/unspecified.rs b/ring-0.17.14/src/error/unspecified.rs new file mode 100644 index 0000000000..22a3e02950 --- /dev/null +++ b/ring-0.17.14/src/error/unspecified.rs @@ -0,0 +1,85 @@ +// Copyright 2016-2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#[cfg(feature = "std")] +extern crate std; + +/// An error with absolutely no details. +/// +/// *ring* uses this unit type as the error type in most of its results +/// because (a) usually the specific reasons for a failure are obvious or are +/// not useful to know, and/or (b) providing more details about a failure might +/// provide a dangerous side channel, and/or (c) it greatly simplifies the +/// error handling logic. +/// +/// `Result` is mostly equivalent to +/// `Result`. However, `ring::error::Unspecified` implements +/// [`std::error::Error`] and users of *ring* can implement +/// `From` to map this to their own error types, as +/// described in [“Error Handling” in the Rust Book]: +/// +/// ``` +/// use ring::rand::{self, SecureRandom}; +/// +/// enum Error { +/// CryptoError, +/// +/// # #[cfg(feature = "alloc")] +/// IOError(std::io::Error), +/// // [...] +/// } +/// +/// impl From for Error { +/// fn from(_: ring::error::Unspecified) -> Self { Error::CryptoError } +/// } +/// +/// fn eight_random_bytes() -> Result<[u8; 8], Error> { +/// let rng = rand::SystemRandom::new(); +/// let mut bytes = [0; 8]; +/// +/// // The `From` implementation above makes this +/// // equivalent to +/// // `rng.fill(&mut bytes).map_err(|_| Error::CryptoError)?`. 
+/// rng.fill(&mut bytes)?; +/// +/// Ok(bytes) +/// } +/// +/// assert!(eight_random_bytes().is_ok()); +/// ``` +/// +/// Experience with using and implementing other crypto libraries like has +/// shown that sophisticated error reporting facilities often cause significant +/// bugs themselves, both within the crypto library and within users of the +/// crypto library. This approach attempts to minimize complexity in the hopes +/// of avoiding such problems. In some cases, this approach may be too extreme, +/// and it may be important for an operation to provide some details about the +/// cause of a failure. Users of *ring* are encouraged to report such cases so +/// that they can be addressed individually. +/// +/// [`std::error::Error`]: https://doc.rust-lang.org/std/error/trait.Error.html +/// [“Error Handling” in the Rust Book]: +/// https://doc.rust-lang.org/book/first-edition/error-handling.html#the-from-trait +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Unspecified; + +// This is required for the implementation of `std::error::Error`. +impl core::fmt::Display for Unspecified { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.write_str("ring::error::Unspecified") + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Unspecified {} diff --git a/ring-0.17.14/src/hkdf.rs b/ring-0.17.14/src/hkdf.rs new file mode 100644 index 0000000000..6a28a4e91a --- /dev/null +++ b/ring-0.17.14/src/hkdf.rs @@ -0,0 +1,233 @@ +// Copyright 2015 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! HMAC-based Extract-and-Expand Key Derivation Function. +//! +//! HKDF is specified in [RFC 5869]. +//! +//! [RFC 5869]: https://tools.ietf.org/html/rfc5869 + +use crate::{error, hmac}; + +/// An HKDF algorithm. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Algorithm(hmac::Algorithm); + +impl Algorithm { + /// The underlying HMAC algorithm. + #[inline] + pub fn hmac_algorithm(&self) -> hmac::Algorithm { + self.0 + } +} + +/// HKDF using HMAC-SHA-1. Obsolete. +pub static HKDF_SHA1_FOR_LEGACY_USE_ONLY: Algorithm = + Algorithm(hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY); + +/// HKDF using HMAC-SHA-256. +pub static HKDF_SHA256: Algorithm = Algorithm(hmac::HMAC_SHA256); + +/// HKDF using HMAC-SHA-384. +pub static HKDF_SHA384: Algorithm = Algorithm(hmac::HMAC_SHA384); + +/// HKDF using HMAC-SHA-512. +pub static HKDF_SHA512: Algorithm = Algorithm(hmac::HMAC_SHA512); + +impl KeyType for Algorithm { + fn len(&self) -> usize { + self.0.digest_algorithm().output_len() + } +} + +/// A salt for HKDF operations. +#[derive(Debug)] +pub struct Salt(hmac::Key); + +impl Salt { + /// Constructs a new `Salt` with the given value based on the given digest + /// algorithm. 
+ /// + /// Constructing a `Salt` is relatively expensive so it is good to reuse a + /// `Salt` object instead of re-constructing `Salt`s with the same value. + pub fn new(algorithm: Algorithm, value: &[u8]) -> Self { + Self(hmac::Key::new(algorithm.0, value)) + } + + /// The [HKDF-Extract] operation. + /// + /// [HKDF-Extract]: https://tools.ietf.org/html/rfc5869#section-2.2 + pub fn extract(&self, secret: &[u8]) -> Prk { + // The spec says that if no salt is provided then a key of + // `digest_alg.output_len` bytes of zeros is used. But, HMAC keys are + // already zero-padded to the block length, which is larger than the output + // length of the extract step (the length of the digest). Consequently the + // `Key` constructor will automatically do the right thing for a + // zero-length string. + let salt = &self.0; + let prk = hmac::sign(salt, secret); + Prk(hmac::Key::new(salt.algorithm(), prk.as_ref())) + } + + /// The algorithm used to derive this salt. + #[inline] + pub fn algorithm(&self) -> Algorithm { + Algorithm(self.0.algorithm()) + } +} + +impl From> for Salt { + fn from(okm: Okm<'_, Algorithm>) -> Self { + Self(hmac::Key::from(Okm { + prk: okm.prk, + info: okm.info, + len: okm.len().0, + len_cached: okm.len_cached, + })) + } +} + +/// The length of the OKM (Output Keying Material) for a `Prk::expand()` call. +pub trait KeyType { + /// The length that `Prk::expand()` should expand its input to. + fn len(&self) -> usize; +} + +/// A HKDF PRK (pseudorandom key). +#[derive(Clone, Debug)] +pub struct Prk(hmac::Key); + +impl Prk { + /// Construct a new `Prk` directly with the given value. + /// + /// Usually one can avoid using this. It is useful when the application + /// intentionally wants to leak the PRK secret, e.g. to implement + /// `SSLKEYLOGFILE` functionality. + pub fn new_less_safe(algorithm: Algorithm, value: &[u8]) -> Self { + Self(hmac::Key::new(algorithm.hmac_algorithm(), value)) + } + + /// The [HKDF-Expand] operation. + /// + /// [HKDF-Expand]: https://tools.ietf.org/html/rfc5869#section-2.3 + /// + /// Fails if (and only if) `len` is too large. + #[inline] + pub fn expand<'a, L: KeyType>( + &'a self, + info: &'a [&'a [u8]], + len: L, + ) -> Result, error::Unspecified> { + let len_cached = len.len(); + if len_cached > 255 * self.0.algorithm().digest_algorithm().output_len() { + return Err(error::Unspecified); + } + Ok(Okm { + prk: self, + info, + len, + len_cached, + }) + } +} + +impl From> for Prk { + fn from(okm: Okm) -> Self { + Self(hmac::Key::from(Okm { + prk: okm.prk, + info: okm.info, + len: okm.len().0, + len_cached: okm.len_cached, + })) + } +} + +/// An HKDF OKM (Output Keying Material) +/// +/// Intentionally not `Clone` or `Copy` as an OKM is generally only safe to +/// use once. +#[derive(Debug)] +pub struct Okm<'a, L: KeyType> { + prk: &'a Prk, + info: &'a [&'a [u8]], + len: L, + len_cached: usize, +} + +impl Okm<'_, L> { + /// The `OkmLength` given to `Prk::expand()`. + #[inline] + pub fn len(&self) -> &L { + &self.len + } + + /// Fills `out` with the output of the HKDF-Expand operation for the given + /// inputs. + /// + /// Fails if (and only if) the requested output length is larger than 255 + /// times the size of the digest algorithm's output. (This is the limit + /// imposed by the HKDF specification due to the way HKDF's counter is + /// constructed.) 
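As an illustrative aside (not part of the vendored sources): a minimal extract-then-expand sketch against the `Salt`/`Prk`/`Okm` API above; the salt and info values are placeholders chosen only for the example:

```rust
use ring::hkdf;

/// Derive a 32-byte key: HKDF-Extract(salt, ikm) -> PRK, then
/// HKDF-Expand(PRK, info) -> OKM. `HKDF_SHA256` doubles as the `KeyType`,
/// so the output buffer must be exactly its output length (32 bytes).
fn derive_key(ikm: &[u8]) -> Result<[u8; 32], ring::error::Unspecified> {
    let salt = hkdf::Salt::new(hkdf::HKDF_SHA256, b"example salt");
    let prk = salt.extract(ikm);

    let info: &[&[u8]] = &[b"example info"];
    let okm = prk.expand(info, hkdf::HKDF_SHA256)?;

    let mut out = [0u8; 32];
    okm.fill(&mut out)?;
    Ok(out)
}
```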
+ #[inline] + pub fn fill(self, out: &mut [u8]) -> Result<(), error::Unspecified> { + fill_okm(self.prk, self.info, out, self.len_cached) + } +} + +fn fill_okm( + prk: &Prk, + info: &[&[u8]], + out: &mut [u8], + len: usize, +) -> Result<(), error::Unspecified> { + if out.len() != len { + return Err(error::Unspecified); + } + + let digest_alg = prk.0.algorithm().digest_algorithm(); + assert!(digest_alg.block_len() >= digest_alg.output_len()); + + let mut ctx = hmac::Context::with_key(&prk.0); + + let mut n = 1u8; + let mut out = out; + loop { + for info in info { + ctx.update(info); + } + ctx.update(&[n]); + + let t = ctx.sign(); + let t = t.as_ref(); + + // Append `t` to the output. + out = if out.len() < digest_alg.output_len() { + let len = out.len(); + out.copy_from_slice(&t[..len]); + &mut [] + } else { + let (this_chunk, rest) = out.split_at_mut(digest_alg.output_len()); + this_chunk.copy_from_slice(t); + rest + }; + + if out.is_empty() { + return Ok(()); + } + + ctx = hmac::Context::with_key(&prk.0); + ctx.update(t); + n = n.checked_add(1).unwrap(); + } +} diff --git a/ring-0.17.14/src/hmac.rs b/ring-0.17.14/src/hmac.rs new file mode 100644 index 0000000000..f0b23809c4 --- /dev/null +++ b/ring-0.17.14/src/hmac.rs @@ -0,0 +1,440 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! HMAC is specified in [RFC 2104]. +//! +//! After a `Key` is constructed, it can be used for multiple signing or +//! verification operations. Separating the construction of the key from the +//! rest of the HMAC operation allows the per-key precomputation to be done +//! only once, instead of it being done in every HMAC operation. +//! +//! Frequently all the data to be signed in a message is available in a single +//! contiguous piece. In that case, the module-level `sign` function can be +//! used. Otherwise, if the input is in multiple parts, `Context` should be +//! used. +//! +//! # Examples: +//! +//! ## Signing a value and verifying it wasn't tampered with +//! +//! ``` +//! use ring::{hmac, rand}; +//! +//! let rng = rand::SystemRandom::new(); +//! let key = hmac::Key::generate(hmac::HMAC_SHA256, &rng)?; +//! +//! let msg = "hello, world"; +//! +//! let tag = hmac::sign(&key, msg.as_bytes()); +//! +//! // [We give access to the message to an untrusted party, and they give it +//! // back to us. We need to verify they didn't tamper with it.] +//! +//! hmac::verify(&key, msg.as_bytes(), tag.as_ref())?; +//! +//! # Ok::<(), ring::error::Unspecified>(()) +//! ``` +//! +//! ## Using the one-shot API: +//! +//! ``` +//! use ring::{digest, hmac, rand}; +//! use ring::rand::SecureRandom; +//! +//! let msg = "hello, world"; +//! +//! // The sender generates a secure key value and signs the message with it. +//! 
// Note that in a real protocol, a key agreement protocol would be used to +//! // derive `key_value`. +//! let rng = rand::SystemRandom::new(); +//! let key_value: [u8; digest::SHA256_OUTPUT_LEN] = rand::generate(&rng)?.expose(); +//! +//! let s_key = hmac::Key::new(hmac::HMAC_SHA256, key_value.as_ref()); +//! let tag = hmac::sign(&s_key, msg.as_bytes()); +//! +//! // The receiver (somehow!) knows the key value, and uses it to verify the +//! // integrity of the message. +//! let v_key = hmac::Key::new(hmac::HMAC_SHA256, key_value.as_ref()); +//! hmac::verify(&v_key, msg.as_bytes(), tag.as_ref())?; +//! +//! # Ok::<(), ring::error::Unspecified>(()) +//! ``` +//! +//! ## Using the multi-part API: +//! ``` +//! use ring::{digest, hmac, rand}; +//! use ring::rand::SecureRandom; +//! +//! let parts = ["hello", ", ", "world"]; +//! +//! // The sender generates a secure key value and signs the message with it. +//! // Note that in a real protocol, a key agreement protocol would be used to +//! // derive `key_value`. +//! let rng = rand::SystemRandom::new(); +//! let mut key_value: [u8; digest::SHA384_OUTPUT_LEN] = rand::generate(&rng)?.expose(); +//! +//! let s_key = hmac::Key::new(hmac::HMAC_SHA384, key_value.as_ref()); +//! let mut s_ctx = hmac::Context::with_key(&s_key); +//! for part in &parts { +//! s_ctx.update(part.as_bytes()); +//! } +//! let tag = s_ctx.sign(); +//! +//! // The receiver (somehow!) knows the key value, and uses it to verify the +//! // integrity of the message. +//! let v_key = hmac::Key::new(hmac::HMAC_SHA384, key_value.as_ref()); +//! let mut msg = Vec::::new(); +//! for part in &parts { +//! msg.extend(part.as_bytes()); +//! } +//! hmac::verify(&v_key, &msg.as_ref(), tag.as_ref())?; +//! +//! # Ok::<(), ring::error::Unspecified>(()) +//! ``` +//! +//! [RFC 2104]: https://tools.ietf.org/html/rfc2104 + +use crate::{ + bb, cpu, + digest::{self, Digest, FinishError}, + error, hkdf, rand, +}; + +pub(crate) use crate::digest::InputTooLongError; + +/// An HMAC algorithm. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Algorithm(&'static digest::Algorithm); + +impl Algorithm { + /// The digest algorithm this HMAC algorithm is based on. + #[inline] + pub fn digest_algorithm(&self) -> &'static digest::Algorithm { + self.0 + } +} + +/// HMAC using SHA-1. Obsolete. +pub static HMAC_SHA1_FOR_LEGACY_USE_ONLY: Algorithm = Algorithm(&digest::SHA1_FOR_LEGACY_USE_ONLY); + +/// HMAC using SHA-256. +pub static HMAC_SHA256: Algorithm = Algorithm(&digest::SHA256); + +/// HMAC using SHA-384. +pub static HMAC_SHA384: Algorithm = Algorithm(&digest::SHA384); + +/// HMAC using SHA-512. +pub static HMAC_SHA512: Algorithm = Algorithm(&digest::SHA512); + +/// An HMAC tag. +/// +/// For a given tag `t`, use `t.as_ref()` to get the tag value as a byte slice. +#[derive(Clone, Copy, Debug)] +pub struct Tag(Digest); + +impl AsRef<[u8]> for Tag { + #[inline] + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +/// A key to use for HMAC signing. +#[derive(Clone)] +pub struct Key { + inner: digest::BlockContext, + outer: digest::BlockContext, +} + +impl core::fmt::Debug for Key { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("Key") + .field("algorithm", self.algorithm().digest_algorithm()) + .finish() + } +} + +impl Key { + /// Generate an HMAC signing key using the given digest algorithm with a + /// random value generated from `rng`. 
+ /// + /// The key will be `digest_alg.output_len` bytes long, based on the + /// recommendation in [RFC 2104 Section 3]. + /// + /// [RFC 2104 Section 3]: https://tools.ietf.org/html/rfc2104#section-3 + pub fn generate( + algorithm: Algorithm, + rng: &dyn rand::SecureRandom, + ) -> Result { + Self::construct(algorithm, |buf| rng.fill(buf), cpu::features()) + } + + fn construct( + algorithm: Algorithm, + fill: F, + cpu: cpu::Features, + ) -> Result + where + F: FnOnce(&mut [u8]) -> Result<(), error::Unspecified>, + { + let mut key_bytes = [0; digest::MAX_OUTPUT_LEN]; + let key_bytes = &mut key_bytes[..algorithm.0.output_len()]; + fill(key_bytes)?; + Self::try_new(algorithm, key_bytes, cpu).map_err(error::erase::) + } + + /// Construct an HMAC signing key using the given digest algorithm and key + /// value. + /// + /// `key_value` should be a value generated using a secure random number + /// generator (e.g. the `key_value` output by + /// `SealingKey::generate_serializable()`) or derived from a random key by + /// a key derivation function (e.g. `ring::hkdf`). In particular, + /// `key_value` shouldn't be a password. + /// + /// As specified in RFC 2104, if `key_value` is shorter than the digest + /// algorithm's block length (as returned by `digest::Algorithm::block_len()`, + /// not the digest length returned by `digest::Algorithm::output_len()`) then + /// it will be padded with zeros. Similarly, if it is longer than the block + /// length then it will be compressed using the digest algorithm. + /// + /// You should not use keys larger than the `digest_alg.block_len` because + /// the truncation described above reduces their strength to only + /// `digest_alg.output_len * 8` bits. Support for such keys is likely to be + /// removed in a future version of *ring*. + pub fn new(algorithm: Algorithm, key_value: &[u8]) -> Self { + Self::try_new(algorithm, key_value, cpu::features()) + .map_err(error::erase::) + .unwrap() + } + + pub(crate) fn try_new( + algorithm: Algorithm, + key_value: &[u8], + cpu_features: cpu::Features, + ) -> Result { + let digest_alg = algorithm.0; + let mut key = Self { + inner: digest::BlockContext::new(digest_alg), + outer: digest::BlockContext::new(digest_alg), + }; + + let block_len = digest_alg.block_len(); + + let key_hash; + let key_value = if key_value.len() <= block_len { + key_value + } else { + key_hash = Digest::compute_from(digest_alg, key_value, cpu_features)?; + key_hash.as_ref() + }; + + const IPAD: u8 = 0x36; + + let mut padded_key = [IPAD; digest::MAX_BLOCK_LEN]; + let padded_key = &mut padded_key[..block_len]; + + // If the key is shorter than one block then we're supposed to act like + // it is padded with zero bytes up to the block length. `x ^ 0 == x` so + // we can just leave the trailing bytes of `padded_key` untouched. + bb::xor_assign_at_start(&mut padded_key[..], key_value); + + let leftover = key.inner.update(padded_key, cpu_features); + debug_assert_eq!(leftover.len(), 0); + + const OPAD: u8 = 0x5C; + + // Remove the `IPAD` masking, leaving the unmasked padded key, then + // mask with `OPAD`, all in one step. + bb::xor_assign(&mut padded_key[..], IPAD ^ OPAD); + let leftover = key.outer.update(padded_key, cpu_features); + debug_assert_eq!(leftover.len(), 0); + + Ok(key) + } + + /// The digest algorithm for the key. 
+ #[inline] + pub fn algorithm(&self) -> Algorithm { + Algorithm(self.inner.algorithm) + } + + pub(crate) fn sign(&self, data: &[u8], cpu: cpu::Features) -> Result { + let mut ctx = Context::with_key(self); + ctx.update(data); + ctx.try_sign(cpu) + } + + fn verify(&self, data: &[u8], tag: &[u8], cpu: cpu::Features) -> Result<(), VerifyError> { + let computed = self + .sign(data, cpu) + .map_err(VerifyError::InputTooLongError)?; + bb::verify_slices_are_equal(computed.as_ref(), tag) + .map_err(|_: error::Unspecified| VerifyError::Mismatch) + } +} + +impl hkdf::KeyType for Algorithm { + fn len(&self) -> usize { + self.digest_algorithm().output_len() + } +} + +impl From> for Key { + fn from(okm: hkdf::Okm) -> Self { + Self::construct(*okm.len(), |buf| okm.fill(buf), cpu::features()).unwrap() + } +} + +/// A context for multi-step (Init-Update-Finish) HMAC signing. +/// +/// Use `sign` for single-step HMAC signing. +#[derive(Clone)] +pub struct Context { + inner: digest::Context, + outer: digest::BlockContext, +} + +impl core::fmt::Debug for Context { + fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { + f.debug_struct("Context") + .field("algorithm", self.inner.algorithm()) + .finish() + } +} + +impl Context { + /// Constructs a new HMAC signing context using the given digest algorithm + /// and key. + pub fn with_key(signing_key: &Key) -> Self { + Self { + inner: digest::Context::clone_from(&signing_key.inner), + outer: signing_key.outer.clone(), + } + } + + /// Updates the HMAC with all the data in `data`. `update` may be called + /// zero or more times until `finish` is called. + pub fn update(&mut self, data: &[u8]) { + self.inner.update(data); + } + + /// Finalizes the HMAC calculation and returns the HMAC value. `sign` + /// consumes the context so it cannot be (mis-)used after `sign` has been + /// called. + /// + /// It is generally not safe to implement HMAC verification by comparing + /// the return value of `sign` to a tag. Use `verify` for verification + /// instead. + pub fn sign(self) -> Tag { + self.try_sign(cpu::features()) + .map_err(error::erase::) + .unwrap() + } + + pub(crate) fn try_sign(self, cpu_features: cpu::Features) -> Result { + // Consequently, `num_pending` is valid. + debug_assert_eq!(self.inner.algorithm(), self.outer.algorithm); + debug_assert!(self.inner.algorithm().output_len() < self.outer.algorithm.block_len()); + + let inner = self.inner.try_finish(cpu_features)?; + let inner = inner.as_ref(); + let num_pending = inner.len(); + let buffer = &mut [0u8; digest::MAX_BLOCK_LEN]; + const _BUFFER_IS_LARGE_ENOUGH_TO_HOLD_INNER: () = + assert!(digest::MAX_OUTPUT_LEN < digest::MAX_BLOCK_LEN); + buffer[..num_pending].copy_from_slice(inner); + + self.outer + .try_finish(buffer, num_pending, cpu_features) + .map(Tag) + .map_err(|err| match err { + FinishError::InputTooLong(i) => { + // Unreachable, as we gave the inner context exactly the + // same input we gave the outer context, and + // `inner.try_finish` already succeeded. However, it is + // quite difficult to prove this, and we already return + // `InputTooLongError`, so just forward it along. + i + } + FinishError::PendingNotAPartialBlock(_) => { + // Follows from the assertions above. + unreachable!() + } + }) + } +} + +/// Calculates the HMAC of `data` using the key `key` in one step. +/// +/// Use `Context` to calculate HMACs where the input is in multiple parts. 
+/// +/// It is generally not safe to implement HMAC verification by comparing the +/// return value of `sign` to a tag. Use `verify` for verification instead. +pub fn sign(key: &Key, data: &[u8]) -> Tag { + key.sign(data, cpu::features()) + .map_err(error::erase::) + .unwrap() +} + +/// Calculates the HMAC of `data` using the signing key `key`, and verifies +/// whether the resultant value equals `tag`, in one step. +/// +/// This is logically equivalent to, but more efficient than, constructing a +/// `Key` with the same value as `key` and then using `verify`. +/// +/// The verification will be done in constant time to prevent timing attacks. +pub fn verify(key: &Key, data: &[u8], tag: &[u8]) -> Result<(), error::Unspecified> { + key.verify(data, tag, cpu::features()) + .map_err(|_: VerifyError| error::Unspecified) +} + +enum VerifyError { + // Theoretically somebody could have calculated a valid tag with a gigantic + // input that we do not support. If we were to support every theoretically + // valid input length, for *every* digest algorithm, then we could argue + // that hitting the input length limit implies a mismatch since nobody + // could have calculated such a tag with the given input. + #[allow(dead_code)] + InputTooLongError(InputTooLongError), + + Mismatch, +} + +#[cfg(test)] +mod tests { + use crate::{hmac, rand}; + + // Make sure that `Key::generate` and `verify_with_own_key` aren't + // completely wacky. + #[test] + pub fn hmac_signing_key_coverage() { + let rng = rand::SystemRandom::new(); + + const HELLO_WORLD_GOOD: &[u8] = b"hello, world"; + const HELLO_WORLD_BAD: &[u8] = b"hello, worle"; + + for algorithm in &[ + hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY, + hmac::HMAC_SHA256, + hmac::HMAC_SHA384, + hmac::HMAC_SHA512, + ] { + let key = hmac::Key::generate(*algorithm, &rng).unwrap(); + let tag = hmac::sign(&key, HELLO_WORLD_GOOD); + assert!(hmac::verify(&key, HELLO_WORLD_GOOD, tag.as_ref()).is_ok()); + assert!(hmac::verify(&key, HELLO_WORLD_BAD, tag.as_ref()).is_err()) + } + } +} diff --git a/ring-0.17.14/src/io.rs b/ring-0.17.14/src/io.rs new file mode 100644 index 0000000000..232d5ff39a --- /dev/null +++ b/ring-0.17.14/src/io.rs @@ -0,0 +1,31 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Serialization and deserialization. + +#[doc(hidden)] +pub mod der; + +#[cfg(feature = "alloc")] +mod writer; + +#[cfg(feature = "alloc")] +pub(crate) mod der_writer; + +pub(crate) mod positive; + +pub use self::positive::Positive; + +#[cfg(feature = "alloc")] +pub(crate) use self::writer::TooLongError; diff --git a/ring-0.17.14/src/io/der.rs b/ring-0.17.14/src/io/der.rs new file mode 100644 index 0000000000..52aed92b9b --- /dev/null +++ b/ring-0.17.14/src/io/der.rs @@ -0,0 +1,301 @@ +// Copyright 2015 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Building blocks for parsing DER-encoded ASN.1 structures. +//! +//! This module contains the foundational parts of an ASN.1 DER parser. + +use super::Positive; +use crate::error; + +pub const CONSTRUCTED: u8 = 1 << 5; +pub const CONTEXT_SPECIFIC: u8 = 2 << 6; + +#[derive(Clone, Copy, PartialEq)] +#[repr(u8)] +pub enum Tag { + Boolean = 0x01, + Integer = 0x02, + BitString = 0x03, + OctetString = 0x04, + Null = 0x05, + OID = 0x06, + Sequence = CONSTRUCTED | 0x10, // 0x30 + UTCTime = 0x17, + GeneralizedTime = 0x18, + + ContextSpecific1 = CONTEXT_SPECIFIC | 1, + + ContextSpecificConstructed0 = CONTEXT_SPECIFIC | CONSTRUCTED | 0, + ContextSpecificConstructed1 = CONTEXT_SPECIFIC | CONSTRUCTED | 1, + ContextSpecificConstructed3 = CONTEXT_SPECIFIC | CONSTRUCTED | 3, +} + +impl From for usize { + fn from(tag: Tag) -> Self { + Self::from(Tag::into(tag)) + } +} + +impl From for u8 { + fn from(tag: Tag) -> Self { + Tag::into(tag) + } +} + +// `impl From for u8` but as a `const fn`. +impl Tag { + pub const fn into(self) -> u8 { + self as u8 + } +} + +pub fn expect_tag_and_get_value<'a>( + input: &mut untrusted::Reader<'a>, + tag: Tag, +) -> Result, error::Unspecified> { + let (actual_tag, inner) = read_tag_and_get_value(input)?; + if usize::from(tag) != usize::from(actual_tag) { + return Err(error::Unspecified); + } + Ok(inner) +} + +pub fn read_tag_and_get_value<'a>( + input: &mut untrusted::Reader<'a>, +) -> Result<(u8, untrusted::Input<'a>), error::Unspecified> { + let tag = input.read_byte()?; + if (tag & 0x1F) == 0x1F { + return Err(error::Unspecified); // High tag number form is not allowed. + } + + // If the high order bit of the first byte is set to zero then the length + // is encoded in the seven remaining bits of that byte. Otherwise, those + // seven bits represent the number of bytes used to encode the length. + let length = match input.read_byte()? { + n if (n & 0x80) == 0 => usize::from(n), + 0x81 => { + let second_byte = input.read_byte()?; + if second_byte < 128 { + return Err(error::Unspecified); // Not the canonical encoding. + } + usize::from(second_byte) + } + 0x82 => { + let second_byte = usize::from(input.read_byte()?); + let third_byte = usize::from(input.read_byte()?); + let combined = (second_byte << 8) | third_byte; + if combined < 256 { + return Err(error::Unspecified); // Not the canonical encoding. + } + combined + } + _ => { + return Err(error::Unspecified); // We don't support longer lengths. 
+ } + }; + + let inner = input.read_bytes(length)?; + Ok((tag, inner)) +} + +#[inline] +pub fn bit_string_with_no_unused_bits<'a>( + input: &mut untrusted::Reader<'a>, +) -> Result, error::Unspecified> { + bit_string_tagged_with_no_unused_bits(Tag::BitString, input) +} + +pub(crate) fn bit_string_tagged_with_no_unused_bits<'a>( + tag: Tag, + input: &mut untrusted::Reader<'a>, +) -> Result, error::Unspecified> { + nested(input, tag, error::Unspecified, |value| { + let unused_bits_at_end = value.read_byte().map_err(|_| error::Unspecified)?; + if unused_bits_at_end != 0 { + return Err(error::Unspecified); + } + Ok(value.read_bytes_to_end()) + }) +} + +// TODO: investigate taking decoder as a reference to reduce generated code +// size. +pub fn nested<'a, F, R, E: Copy>( + input: &mut untrusted::Reader<'a>, + tag: Tag, + error: E, + decoder: F, +) -> Result +where + F: FnOnce(&mut untrusted::Reader<'a>) -> Result, +{ + let inner = expect_tag_and_get_value(input, tag).map_err(|_| error)?; + inner.read_all(error, decoder) +} + +pub(crate) fn nonnegative_integer<'a>( + input: &mut untrusted::Reader<'a>, +) -> Result, error::Unspecified> { + let value = expect_tag_and_get_value(input, Tag::Integer)?; + match value + .as_slice_less_safe() + .split_first() + .ok_or(error::Unspecified)? + { + // Zero or leading zero. + (0, rest) => { + match rest.first() { + // Zero. + None => Ok(value), + // Necessary leading zero. + Some(&second) if second & 0x80 == 0x80 => Ok(untrusted::Input::from(rest)), + // Unnecessary leading zero. + _ => Err(error::Unspecified), + } + } + // Positive value with no leading zero. + (first, _) if first & 0x80 == 0 => Ok(value), + // Negative value. + (_, _) => Err(error::Unspecified), + } +} + +/// Parse as integer with a value in the in the range [0, 255], returning its +/// numeric value. This is typically used for parsing version numbers. +#[inline] +pub fn small_nonnegative_integer(input: &mut untrusted::Reader) -> Result { + let value = nonnegative_integer(input)?; + match *value.as_slice_less_safe() { + [b] => Ok(b), + _ => Err(error::Unspecified), + } +} + +/// Parses a positive DER integer, returning the big-endian-encoded value, +/// sans any leading zero byte. +pub fn positive_integer<'a>( + input: &mut untrusted::Reader<'a>, +) -> Result, error::Unspecified> { + let value = nonnegative_integer(input)?; + Positive::from_be_bytes(value) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::error; + + fn with_i<'a, F, R>(value: &'a [u8], f: F) -> Result + where + F: FnOnce(&mut untrusted::Reader<'a>) -> Result, + { + untrusted::Input::from(value).read_all(error::Unspecified, f) + } + + static ZERO_INTEGER: &[u8] = &[0x02, 0x01, 0x00]; + + static GOOD_POSITIVE_INTEGERS_SMALL: &[(&[u8], u8)] = &[ + (&[0x02, 0x01, 0x01], 0x01), + (&[0x02, 0x01, 0x02], 0x02), + (&[0x02, 0x01, 0x7e], 0x7e), + (&[0x02, 0x01, 0x7f], 0x7f), + // Values that need to have an 0x00 prefix to disambiguate them from + // them from negative values. + (&[0x02, 0x02, 0x00, 0x80], 0x80), + (&[0x02, 0x02, 0x00, 0x81], 0x81), + (&[0x02, 0x02, 0x00, 0xfe], 0xfe), + (&[0x02, 0x02, 0x00, 0xff], 0xff), + ]; + + static GOOD_POSITIVE_INTEGERS_LARGE: &[(&[u8], &[u8])] = &[ + (&[0x02, 0x02, 0x01, 0x00], &[0x01, 0x00]), + (&[0x02, 0x02, 0x02, 0x01], &[0x02, 0x01]), + (&[0x02, 0x02, 0x7e, 0xfe], &[0x7e, 0xfe]), + (&[0x02, 0x02, 0x7f, 0xff], &[0x7f, 0xff]), + // Values that need to have an 0x00 prefix to disambiguate them from + // them from negative values. 
+ (&[0x02, 0x03, 0x00, 0x80, 0x00], &[0x80, 0x00]), + (&[0x02, 0x03, 0x00, 0x81, 0x01], &[0x81, 0x01]), + (&[0x02, 0x03, 0x00, 0xfe, 0xfe], &[0xfe, 0xfe]), + (&[0x02, 0x03, 0x00, 0xff, 0xff], &[0xff, 0xff]), + ]; + + static BAD_NONNEGATIVE_INTEGERS: &[&[u8]] = &[ + &[], // At end of input + &[0x02], // Tag only + &[0x02, 0x00], // Empty value + // Length mismatch + &[0x02, 0x00, 0x01], + &[0x02, 0x01], + // Would be valid if leading zero is ignored when comparing length. + &[0x02, 0x01, 0x00, 0x01], + &[0x02, 0x01, 0x01, 0x00], // Would be valid if last byte is ignored. + &[0x02, 0x02, 0x01], + // Values that are missing a necessary leading 0x00 + &[0x02, 0x01, 0x80], + &[0x02, 0x01, 0x81], + &[0x02, 0x01, 0xfe], + &[0x02, 0x01, 0xff], + // Values that have an unnecessary leading 0x00 + &[0x02, 0x02, 0x00, 0x00], + &[0x02, 0x02, 0x00, 0x01], + &[0x02, 0x02, 0x00, 0x02], + &[0x02, 0x02, 0x00, 0x7e], + &[0x02, 0x02, 0x00, 0x7f], + ]; + + #[test] + fn test_small_nonnegative_integer() { + let zero = (ZERO_INTEGER, 0x00); + for &(test_in, test_out) in + core::iter::once(&zero).chain(GOOD_POSITIVE_INTEGERS_SMALL.iter()) + { + let result = with_i(test_in, |input| { + assert_eq!(small_nonnegative_integer(input)?, test_out); + Ok(()) + }); + assert_eq!(result, Ok(())); + } + for &test_in in BAD_NONNEGATIVE_INTEGERS + .iter() + .chain(GOOD_POSITIVE_INTEGERS_LARGE.iter().map(|(input, _)| input)) + { + let result = with_i(test_in, small_nonnegative_integer); + assert_eq!(result, Err(error::Unspecified)); + } + } + + #[test] + fn test_positive_integer() { + for (test_in, test_out) in GOOD_POSITIVE_INTEGERS_SMALL + .iter() + .map(|(test_in, test_out)| (*test_in, core::slice::from_ref(test_out))) + .chain(GOOD_POSITIVE_INTEGERS_LARGE.iter().copied()) + { + let result = with_i(test_in, |input| { + assert_eq!( + positive_integer(input)?.big_endian_without_leading_zero(), + test_out + ); + Ok(()) + }); + assert_eq!(result, Ok(())) + } + for &test_in in core::iter::once(&ZERO_INTEGER).chain(BAD_NONNEGATIVE_INTEGERS.iter()) { + let result = with_i(test_in, positive_integer); + assert!(matches!(result, Err(error::Unspecified))); + } + } +} diff --git a/ring-0.17.14/src/io/der_writer.rs b/ring-0.17.14/src/io/der_writer.rs new file mode 100644 index 0000000000..d6f9717d6c --- /dev/null +++ b/ring-0.17.14/src/io/der_writer.rs @@ -0,0 +1,71 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
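As a quick reference (not part of the vendored sources) for the definite-length forms accepted by the parser above and emitted by the writer below, here is a small sketch with example encodings; the OCTET STRING tag 0x04 is arbitrary:

```rust
fn der_length_examples() {
    // Short form: header for a 3-byte OCTET STRING (tag 0x04), value "ace".
    let _short: [u8; 5] = [0x04, 0x03, b'a', b'c', b'e'];
    // 0x81 form: header for a 128-byte value (the 128 content bytes follow);
    // the second length byte must be >= 0x80.
    let _len_128: [u8; 3] = [0x04, 0x81, 0x80];
    // 0x82 form: header for a 256-byte value (0x0100 big-endian); the
    // combined value must be >= 256.
    let _len_256: [u8; 4] = [0x04, 0x82, 0x01, 0x00];
    // Rejected as non-canonical: [0x04, 0x81, 0x05] (5 fits the short form)
    // and [0x04, 0x82, 0x00, 0xff] (255 fits the 0x81 form).
}
```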
+ +use super::{der::*, writer::*, *}; +use alloc::boxed::Box; + +pub(crate) fn write_positive_integer( + output: &mut dyn Accumulator, + value: &Positive, +) -> Result<(), TooLongError> { + let first_byte = value.first_byte(); + let value = value.big_endian_without_leading_zero_as_input(); + write_tlv(output, Tag::Integer, |output| { + if (first_byte & 0x80) != 0 { + output.write_byte(0)?; // Disambiguate negative number. + } + write_copy(output, value) + }) +} + +pub(crate) fn write_all( + tag: Tag, + write_value: &dyn Fn(&mut dyn Accumulator) -> Result<(), TooLongError>, +) -> Result, TooLongError> { + let length = { + let mut length = LengthMeasurement::zero(); + write_tlv(&mut length, tag, write_value)?; + length + }; + + let mut output = Writer::with_capacity(length); + write_tlv(&mut output, tag, write_value)?; + + Ok(output.into()) +} + +fn write_tlv(output: &mut dyn Accumulator, tag: Tag, write_value: F) -> Result<(), TooLongError> +where + F: Fn(&mut dyn Accumulator) -> Result<(), TooLongError>, +{ + let length: usize = { + let mut length = LengthMeasurement::zero(); + write_value(&mut length)?; + length.into() + }; + let length: u16 = length.try_into().map_err(|_| TooLongError::new())?; + + output.write_byte(tag.into())?; + + let [lo, hi] = length.to_le_bytes(); + if length >= 0x1_00 { + output.write_byte(0x82)?; + output.write_byte(hi)?; + } else if length >= 0x80 { + output.write_byte(0x81)?; + } + output.write_byte(lo)?; + + write_value(output) +} diff --git a/ring-0.17.14/src/io/positive.rs b/ring-0.17.14/src/io/positive.rs new file mode 100644 index 0000000000..33f2fefc57 --- /dev/null +++ b/ring-0.17.14/src/io/positive.rs @@ -0,0 +1,98 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Serialization and deserialization. + +use crate::error; + +/// A serialized positive integer. +#[derive(Copy, Clone)] +pub struct Positive<'a>(untrusted::Input<'a>); + +impl<'a> Positive<'a> { + #[inline] + pub(crate) fn from_be_bytes(input: untrusted::Input<'a>) -> Result { + // Empty inputs are not allowed. + let &first_byte = input + .as_slice_less_safe() + .first() + .ok_or(error::Unspecified)?; + // Zero isn't allowed and leading zeros aren't allowed. + if first_byte == 0 { + return Err(error::Unspecified); + } + Ok(Self(input)) + } + + /// Returns the value, ordered from significant byte to least significant + /// byte, without any leading zeros. The result is guaranteed to be + /// non-empty. + #[inline] + pub fn big_endian_without_leading_zero(&self) -> &'a [u8] { + self.big_endian_without_leading_zero_as_input() + .as_slice_less_safe() + } + + #[inline] + pub(crate) fn big_endian_without_leading_zero_as_input(&self) -> untrusted::Input<'a> { + self.0 + } +} + +impl Positive<'_> { + /// Returns the first byte. 
+ /// + /// Will not panic because the value is guaranteed to have at least one + /// byte. + pub fn first_byte(&self) -> u8 { + // This won't panic because + self.0.as_slice_less_safe()[0] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_from_be_bytes() { + static TEST_CASES: &[(&[u8], Result<&[u8], error::Unspecified>)] = &[ + // An empty input isn't a number. + (&[], Err(error::Unspecified)), + // Zero is not positive. + (&[0x00], Err(error::Unspecified)), + // Minimum value. No leading zero required or allowed. + (&[0x00, 0x01], Err(error::Unspecified)), + (&[0x01], Ok(&[0x01])), + // Maximum first byte. No leading zero required or allowed. + (&[0xff], Ok(&[0xff])), + (&[0x00, 0xff], Err(error::Unspecified)), + // The last byte can be zero. + (&[0x01, 0x00], Ok(&[0x01, 0x00])), + (&[0x01, 0x00, 0x00], Ok(&[0x01, 0x00, 0x00])), + // Having no zero bytes are also allowed. + (&[0x01, 0x01], Ok(&[0x01, 0x01])), + // A middle byte can be zero. + (&[0x01, 0x00, 0x01], Ok(&[0x01, 0x00, 0x01])), + (&[0x01, 0x01, 0x01], Ok(&[0x01, 0x01, 0x01])), + ]; + for &(input, result) in TEST_CASES { + let input = untrusted::Input::from(input); + assert_eq!( + Positive::from_be_bytes(input).map(|p| p.big_endian_without_leading_zero()), + result + ); + } + } +} diff --git a/ring-0.17.14/src/io/writer.rs b/ring-0.17.14/src/io/writer.rs new file mode 100644 index 0000000000..9ebe0d32a3 --- /dev/null +++ b/ring-0.17.14/src/io/writer.rs @@ -0,0 +1,97 @@ +// Copyright 2018 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
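As an illustrative aside (not part of the vendored sources): `write_all`/`write_tlv` above run the same serializer twice — once against a byte counter to learn the exact length (needed both for the DER length prefix and to size the buffer), then against a real buffer. A condensed, generic sketch of that measure-then-write pattern, mirroring the `LengthMeasurement` and `Writer` accumulators defined just below:

```rust
trait Sink {
    fn put(&mut self, bytes: &[u8]);
}

/// First pass: only count bytes.
struct Counter(usize);
impl Sink for Counter {
    fn put(&mut self, bytes: &[u8]) {
        self.0 += bytes.len();
    }
}

/// Second pass: write into a buffer pre-sized to the measured length.
struct Buffer(Vec<u8>);
impl Sink for Buffer {
    fn put(&mut self, bytes: &[u8]) {
        self.0.extend_from_slice(bytes);
    }
}

fn measure_then_write(write: impl Fn(&mut dyn Sink)) -> Vec<u8> {
    let mut counter = Counter(0);
    write(&mut counter);
    let mut buf = Buffer(Vec::with_capacity(counter.0));
    write(&mut buf);
    // Both passes must agree, exactly as Writer asserts against its
    // requested capacity.
    debug_assert_eq!(buf.0.len(), counter.0);
    buf.0
}
```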
+ +use alloc::{boxed::Box, vec::Vec}; + +pub trait Accumulator { + fn write_byte(&mut self, value: u8) -> Result<(), TooLongError>; + fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError>; +} + +pub(super) struct LengthMeasurement { + len: usize, +} + +impl From for usize { + fn from(len: LengthMeasurement) -> usize { + len.len + } +} + +impl LengthMeasurement { + pub fn zero() -> Self { + Self { len: 0 } + } +} + +impl Accumulator for LengthMeasurement { + fn write_byte(&mut self, _value: u8) -> Result<(), TooLongError> { + self.len = self.len.checked_add(1).ok_or_else(TooLongError::new)?; + Ok(()) + } + fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError> { + self.len = self + .len + .checked_add(value.len()) + .ok_or_else(TooLongError::new)?; + Ok(()) + } +} + +pub(super) struct Writer { + bytes: Vec, + requested_capacity: usize, +} + +impl Writer { + pub(super) fn with_capacity(capacity: LengthMeasurement) -> Self { + Self { + bytes: Vec::with_capacity(capacity.len), + requested_capacity: capacity.len, + } + } +} + +impl From for Box<[u8]> { + fn from(writer: Writer) -> Self { + assert_eq!(writer.requested_capacity, writer.bytes.len()); + writer.bytes.into_boxed_slice() + } +} + +impl Accumulator for Writer { + fn write_byte(&mut self, value: u8) -> Result<(), TooLongError> { + self.bytes.push(value); + Ok(()) + } + fn write_bytes(&mut self, value: &[u8]) -> Result<(), TooLongError> { + self.bytes.extend(value); + Ok(()) + } +} + +pub fn write_copy( + accumulator: &mut dyn Accumulator, + to_copy: untrusted::Input, +) -> Result<(), TooLongError> { + accumulator.write_bytes(to_copy.as_slice_less_safe()) +} + +pub struct TooLongError(()); + +impl TooLongError { + pub fn new() -> Self { + Self(()) + } +} diff --git a/ring-0.17.14/src/lib.rs b/ring-0.17.14/src/lib.rs new file mode 100644 index 0000000000..5d33f39026 --- /dev/null +++ b/ring-0.17.14/src/lib.rs @@ -0,0 +1,180 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! # Feature Flags +//! +//! +//!
+//! <table>
+//! <tr><th>Feature
+//!     <th>Description
+//! <tr><td>alloc (default)
+//!     <td>Enable features that require use of the heap, RSA in particular.
+//! <tr><td>less-safe-getrandom-custom-or-rdrand
+//!     <td>Treat user-provided ("custom") and RDRAND-based getrandom
+//!         implementations as secure random number generators (see
+//!         SecureRandom). This feature only works with os = "none" targets.
+//!         See register_custom_getrandom and RDRAND on x86 in the getrandom
+//!         documentation for additional details.
+//! <tr><td>less-safe-getrandom-espidf
+//!     <td>Treat getrandom as a secure random number generator (see
+//!         SecureRandom) on the esp-idf target. While the esp-idf target does
+//!         have hardware RNG, it is beyond the scope of ring to ensure its
+//!         configuration. This feature allows ring to build on esp-idf despite
+//!         the likelihood that RNG is not secure. This feature only works with
+//!         os = espidf targets.
+//! <tr><td>std
+//!     <td>Enable features that use libstd, in particular std::error::Error
+//!         integration. Implies `alloc`.
+//! <tr><td>wasm32_unknown_unknown_js
+//!     <td>When this feature is enabled, for the wasm32-unknown-unknown target,
+//!         Web APIs will be used to implement features like `ring::rand` that
+//!         require an operating environment of some kind. This has no effect
+//!         for any other target. This enables the `getrandom` crate's `js`
+//!         feature.
+//! </table>
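+//!
+//! (Note added in this vendored copy, not present in upstream ring: the sketch
+//! below illustrates how the less-safe-getrandom-custom-or-rdrand feature is
+//! typically wired up on an `os = "none"` target. It assumes the `getrandom`
+//! crate's `custom` feature; `fill_from_hw_rng` is a hypothetical platform
+//! entropy source, not an API of ring or getrandom.)
+//!
+//! ```ignore
+//! use getrandom::{register_custom_getrandom, Error};
+//!
+//! // Hypothetical platform-specific entropy source.
+//! fn fill_from_hw_rng(buf: &mut [u8]) -> Result<(), Error> {
+//!     // A real implementation must fill `buf` from a hardware RNG;
+//!     // this placeholder is NOT secure.
+//!     buf.fill(0);
+//!     Ok(())
+//! }
+//!
+//! // Once registered, and with less-safe-getrandom-custom-or-rdrand enabled,
+//! // `ring::rand::SystemRandom` treats this implementation as secure.
+//! register_custom_getrandom!(fill_from_hw_rng);
+//! ```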
+ +// When running mk/package.sh, don't actually build any code. +#![allow( + clippy::collapsible_if, + clippy::identity_op, + clippy::len_without_is_empty, + clippy::let_unit_value, + clippy::new_without_default, + clippy::neg_cmp_op_on_partial_ord, + clippy::too_many_arguments, + clippy::type_complexity, + non_camel_case_types, + non_snake_case, + unsafe_code +)] +#![deny(variant_size_differences)] +#![forbid( + unused_results, + unsafe_op_in_unsafe_fn, + clippy::char_lit_as_u8, + clippy::fn_to_numeric_cast, + clippy::fn_to_numeric_cast_with_truncation, + clippy::ptr_as_ptr +)] +#![warn( + clippy::unnecessary_cast, + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_precision_loss, + clippy::cast_sign_loss +)] +#![cfg_attr( + not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64", + feature = "alloc" + )), + allow(dead_code, unused_imports, unused_macros) +)] +#![no_std] + +#[cfg(all(target_arch = "aarch64", not(target_os="optee")))] +extern crate libc; + +#[cfg(feature = "alloc")] +extern crate alloc; + +#[macro_use] +mod debug; + +#[macro_use] +mod prefixed; + +#[doc(hidden)] +#[macro_use] +mod testutil; + +#[macro_use] +mod bssl; + +#[macro_use] +mod polyfill; + +pub mod aead; + +pub mod agreement; +mod arithmetic; +mod bits; + +pub(crate) mod bb; +pub(crate) mod c; + +#[doc(hidden)] +#[deprecated( + note = "Will be removed. Internal module not intended for external use, with no promises regarding side channels." +)] +pub mod deprecated_constant_time; + +#[doc(hidden)] +#[allow(deprecated)] +#[deprecated( + note = "Will be removed. Internal module not intended for external use, with no promises regarding side channels." +)] +pub use deprecated_constant_time as constant_time; + +pub mod io; + +mod cpu; +pub mod digest; +mod ec; +pub mod error; +pub mod hkdf; +pub mod hmac; +mod limb; +pub mod pbkdf2; +pub mod pkcs8; +pub mod rand; + +#[cfg(feature = "alloc")] +pub mod rsa; + +pub mod signature; + +#[cfg(test)] +mod tests; + +mod sealed { + /// Traits that are designed to only be implemented internally in *ring*. + // + // Usage: + // ``` + // use crate::sealed; + // + // pub trait MyType: sealed::Sealed { + // // [...] + // } + // + // impl sealed::Sealed for MyType {} + // ``` + pub trait Sealed {} +} + +#[deprecated(note = "internal API that will be removed")] +pub mod deprecated_test; + +#[allow(deprecated)] +#[deprecated(note = "internal API that will be removed")] +pub use deprecated_test as test; diff --git a/ring-0.17.14/src/limb.rs b/ring-0.17.14/src/limb.rs new file mode 100644 index 0000000000..fef4bc058b --- /dev/null +++ b/ring-0.17.14/src/limb.rs @@ -0,0 +1,662 @@ +// Copyright 2016 David Judd. +// Copyright 2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Unsigned multi-precision integer arithmetic. +//! +//! Limbs ordered least-significant-limb to most-significant-limb. The bits +//! limbs use the native endianness. + +use crate::{ + arithmetic::inout::{AliasingSlices2, AliasingSlices3}, + bb, c, + error::{self, LenMismatchError}, + polyfill::{sliceutil, usize_from_u32, ArrayFlatMap}, +}; +use core::{iter, num::NonZeroUsize}; + +#[cfg(any(test, feature = "alloc"))] +use crate::bits; + +#[cfg(feature = "alloc")] +use core::num::Wrapping; + +// XXX: Not correct for x32 ABIs. +pub type Limb = bb::Word; +pub type LeakyLimb = bb::LeakyWord; +pub const LIMB_BITS: usize = usize_from_u32(Limb::BITS); +pub const LIMB_BYTES: usize = (LIMB_BITS + 7) / 8; + +pub type LimbMask = bb::BoolMask; + +#[inline] +pub fn limbs_equal_limbs_consttime(a: &[Limb], b: &[Limb]) -> Result { + if a.len() != b.len() { + return Err(LenMismatchError::new(a.len())); + } + let all = a.iter().zip(b).fold(0, |running, (a, b)| running | (a ^ b)); + Ok(limb_is_zero(all)) +} + +#[inline] +fn limbs_less_than_limbs(a: &[Limb], b: &[Limb]) -> Result { + prefixed_extern! { + fn LIMBS_less_than(a: *const Limb, b: *const Limb, num_limbs: c::NonZero_size_t) + -> LimbMask; + } + // Use `b.len` because usually `b` will be the modulus which is likely to + // have had its length checked already so this can be elided by the + // optimizer. + // XXX: Questionable whether `LenMismatchError` is appropriate. + let len = NonZeroUsize::new(b.len()).ok_or_else(|| LenMismatchError::new(a.len()))?; + if a.len() != len.get() { + return Err(LenMismatchError::new(a.len())); + } + Ok(unsafe { LIMBS_less_than(a.as_ptr(), b.as_ptr(), len) }) +} + +#[inline] +pub(crate) fn verify_limbs_less_than_limbs_leak_bit( + a: &[Limb], + b: &[Limb], +) -> Result<(), error::Unspecified> { + let r = limbs_less_than_limbs(a, b).map_err(error::erase::)?; + if r.leak() { + Ok(()) + } else { + Err(error::Unspecified) + } +} + +#[inline] +pub fn limbs_less_than_limbs_vartime(a: &[Limb], b: &[Limb]) -> Result { + let r = limbs_less_than_limbs(a, b)?; + Ok(r.leak()) +} + +#[inline] +fn limb_is_zero(limb: Limb) -> LimbMask { + prefixed_extern! { + fn LIMB_is_zero(limb: Limb) -> LimbMask; + } + unsafe { LIMB_is_zero(limb) } +} + +#[inline] +pub fn limbs_are_zero(limbs: &[Limb]) -> LimbMask { + limb_is_zero(limbs.iter().fold(0, |a, b| a | b)) +} + +/// Leaks one bit of information (other than the lengths of the inputs): +/// Whether the given limbs are even. +#[cfg(any(test, feature = "alloc"))] +#[inline] +pub fn limbs_reject_even_leak_bit(limbs: &[Limb]) -> Result<(), error::Unspecified> { + let bottom = *limbs.first().ok_or(error::Unspecified)?; + if limb_is_zero(bottom & 1).leak() { + return Err(error::Unspecified); + } + Ok(()) +} + +#[cfg(any(test, feature = "alloc"))] +#[inline] +pub fn verify_limbs_equal_1_leak_bit(a: &[Limb]) -> Result<(), error::Unspecified> { + if let [bottom, ref rest @ ..] = *a { + let equal = limb_is_zero(bottom ^ 1) & limbs_are_zero(rest); + if equal.leak() { + return Ok(()); + } + } + Err(error::Unspecified) +} + +/// Returns the number of bits in `a`. 
+// +// This strives to be constant-time with respect to the values of all bits +// except the most significant bit. This does not attempt to be constant-time +// with respect to `a.len()` or the value of the result or the value of the +// most significant bit (It's 1, unless the input is zero, in which case it's +// zero.) +#[cfg(any(test, feature = "alloc"))] +pub fn limbs_minimal_bits(a: &[Limb]) -> bits::BitLength { + for num_limbs in (1..=a.len()).rev() { + let high_limb = a[num_limbs - 1]; + + // Find the number of set bits in |high_limb| by a linear scan from the + // most significant bit to the least significant bit. This works great + // for the most common inputs because usually the most significant bit + // it set. + for high_limb_num_bits in (1..=LIMB_BITS).rev() { + let shifted = unsafe { LIMB_shr(high_limb, high_limb_num_bits - 1) }; + if shifted != 0 { + return bits::BitLength::from_bits( + ((num_limbs - 1) * LIMB_BITS) + high_limb_num_bits, + ); + } + } + } + + // No bits were set. + bits::BitLength::from_bits(0) +} + +/// Equivalent to `if (r >= m) { r -= m; }` +#[inline] +pub fn limbs_reduce_once(r: &mut [Limb], m: &[Limb]) -> Result<(), LenMismatchError> { + prefixed_extern! { + fn LIMBS_reduce_once(r: *mut Limb, m: *const Limb, num_limbs: c::NonZero_size_t); + } + let num_limbs = NonZeroUsize::new(r.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; + let r = r.as_mut_ptr(); // Non-dangling because num_limbs is non-zero. + let m = m.as_ptr(); // Non-dangling because num_limbs is non-zero. + unsafe { LIMBS_reduce_once(r, m, num_limbs) }; + Ok(()) +} + +#[derive(Clone, Copy, PartialEq)] +pub enum AllowZero { + No, + Yes, +} + +/// Parses `input` into `result`, verifies that the value is less than +/// `max_exclusive`, and pads `result` with zeros to its length. If `allow_zero` +/// is not `AllowZero::Yes`, zero values are rejected. +/// +/// This attempts to be constant-time with respect to the actual value *only if* +/// the value is actually in range. In other words, this won't leak anything +/// about a valid value, but it might leak small amounts of information about an +/// invalid value (which constraint it failed). +pub fn parse_big_endian_in_range_and_pad_consttime( + input: untrusted::Input, + allow_zero: AllowZero, + max_exclusive: &[Limb], + result: &mut [Limb], +) -> Result<(), error::Unspecified> { + parse_big_endian_and_pad_consttime(input, result)?; + verify_limbs_less_than_limbs_leak_bit(result, max_exclusive)?; + if allow_zero != AllowZero::Yes { + if limbs_are_zero(result).leak() { + return Err(error::Unspecified); + } + } + Ok(()) +} + +/// Parses `input` into `result`, padding `result` with zeros to its length. +/// This attempts to be constant-time with respect to the value but not with +/// respect to the length; it is assumed that the length is public knowledge. 
+pub fn parse_big_endian_and_pad_consttime( + input: untrusted::Input, + result: &mut [Limb], +) -> Result<(), error::Unspecified> { + if input.is_empty() { + return Err(error::Unspecified); + } + let input_limbs = input.as_slice_less_safe().rchunks(LIMB_BYTES).map(|chunk| { + let mut padded = [0; LIMB_BYTES]; + sliceutil::overwrite_at_start(&mut padded[(LIMB_BYTES - chunk.len())..], chunk); + Limb::from_be_bytes(padded) + }); + if input_limbs.len() > result.len() { + return Err(error::Unspecified); + } + + result + .iter_mut() + .zip(input_limbs.chain(iter::repeat(0))) + .for_each(|(r, i)| *r = i); + + Ok(()) +} + +pub fn big_endian_from_limbs(limbs: &[Limb], out: &mut [u8]) { + let be_bytes = unstripped_be_bytes(limbs); + assert_eq!(out.len(), be_bytes.len()); + out.iter_mut().zip(be_bytes).for_each(|(o, i)| { + *o = i; + }); +} + +/// Returns an iterator of the big-endian encoding of `limbs`. +/// +/// The number of bytes returned will be a multiple of `LIMB_BYTES` +/// and thus may be padded with leading zeros. +pub fn unstripped_be_bytes(limbs: &[Limb]) -> impl ExactSizeIterator + Clone + '_ { + // The unwrap is safe because a slice can never be larger than `usize` bytes. + ArrayFlatMap::new(limbs.iter().rev().copied(), Limb::to_be_bytes).unwrap() +} + +// Used in FFI +pub type Window = bb::Word; + +// Used in FFI +pub type LeakyWindow = bb::LeakyWord; + +/// Processes `limbs` as a sequence of 5-bit windows, folding the windows from +/// most significant to least significant and returning the accumulated result. +/// The first window will be mapped by `init` to produce the initial value for +/// the accumulator. Then `f` will be called to fold the accumulator and the +/// next window until all windows are processed. When the input's bit length +/// isn't divisible by 5, the window passed to `init` will be partial; all +/// windows passed to `fold` will be full. +/// +/// This is designed to avoid leaking the contents of `limbs` through side +/// channels as long as `init` and `fold` are side-channel free. +/// +/// Panics if `limbs` is empty. +#[cfg(feature = "alloc")] +pub fn fold_5_bit_windows R, F: Fn(R, Window) -> R>( + limbs: &[Limb], + init: I, + fold: F, +) -> R { + #[derive(Clone, Copy)] + #[repr(transparent)] + struct BitIndex(Wrapping); + + const WINDOW_BITS: Wrapping = Wrapping(5); + + prefixed_extern! 
{ + fn LIMBS_window5_split_window( + lower_limb: Limb, + higher_limb: Limb, + index_within_word: BitIndex, + ) -> Window; + fn LIMBS_window5_unsplit_window(limb: Limb, index_within_word: BitIndex) -> Window; + } + + let num_limbs = limbs.len(); + let mut window_low_bit = { + let num_whole_windows = (num_limbs * LIMB_BITS) / 5; + let mut leading_bits = (num_limbs * LIMB_BITS) - (num_whole_windows * 5); + if leading_bits == 0 { + leading_bits = WINDOW_BITS.0; + } + BitIndex(Wrapping(LIMB_BITS - leading_bits)) + }; + + let initial_value = { + let leading_partial_window = + unsafe { LIMBS_window5_split_window(*limbs.first().unwrap(), 0, window_low_bit) }; + window_low_bit.0 -= WINDOW_BITS; + init(leading_partial_window) + }; + + let mut low_limb = Limb::from(0 as LeakyWindow); + limbs.iter().fold(initial_value, |mut acc, current_limb| { + let higher_limb = low_limb; + low_limb = *current_limb; + + if window_low_bit.0 > Wrapping(LIMB_BITS) - WINDOW_BITS { + let window = + unsafe { LIMBS_window5_split_window(low_limb, higher_limb, window_low_bit) }; + window_low_bit.0 -= WINDOW_BITS; + acc = fold(acc, window); + }; + while window_low_bit.0 < Wrapping(LIMB_BITS) { + let window = unsafe { LIMBS_window5_unsplit_window(low_limb, window_low_bit) }; + // The loop exits when this subtraction underflows, causing `window_low_bit` to + // wrap around to a very large value. + window_low_bit.0 -= WINDOW_BITS; + acc = fold(acc, window); + } + window_low_bit.0 += Wrapping(LIMB_BITS); // "Fix" the underflow. + + acc + }) +} + +#[inline] +pub(crate) fn limbs_add_assign_mod( + a: &mut [Limb], + b: &[Limb], + m: &[Limb], +) -> Result<(), LenMismatchError> { + prefixed_extern! { + // `r` and `a` may alias. + fn LIMBS_add_mod( + r: *mut Limb, + a: *const Limb, + b: *const Limb, + m: *const Limb, + num_limbs: c::NonZero_size_t, + ); + } + let num_limbs = NonZeroUsize::new(m.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; + (a, b).with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, b| { + let m = m.as_ptr(); // Also non-dangling because `num_limbs` is non-zero. + unsafe { LIMBS_add_mod(r, a, b, m, num_limbs) } + }) +} + +// r *= 2 (mod m). +pub(crate) fn limbs_double_mod(r: &mut [Limb], m: &[Limb]) -> Result<(), LenMismatchError> { + prefixed_extern! { + // `r` and `a` may alias. + fn LIMBS_shl_mod( + r: *mut Limb, + a: *const Limb, + m: *const Limb, + num_limbs: c::NonZero_size_t); + } + let num_limbs = NonZeroUsize::new(m.len()).ok_or_else(|| LenMismatchError::new(m.len()))?; + r.with_non_dangling_non_null_pointers_ra(num_limbs, |r, a| { + let m = m.as_ptr(); // Also non-dangling because num_limbs > 0. + unsafe { + LIMBS_shl_mod(r, a, m, num_limbs); + } + }) +} + +// *r = -a, assuming a is odd. +pub(crate) fn limbs_negative_odd(r: &mut [Limb], a: &[Limb]) { + debug_assert_eq!(r.len(), a.len()); + // Two's complement step 1: flip all the bits. + // The compiler should optimize this to vectorized (a ^ !0). + r.iter_mut().zip(a.iter()).for_each(|(r, &a)| { + *r = !a; + }); + // Two's complement step 2: Add one. Since `a` is odd, `r` is even. Thus we + // can use a bitwise or for addition. + r[0] |= 1; +} + +#[cfg(any(test, feature = "alloc"))] +prefixed_extern! 
{ + fn LIMB_shr(a: Limb, shift: c::size_t) -> Limb; +} + +#[allow(clippy::useless_conversion)] +#[cfg(test)] +mod tests { + use super::*; + use alloc::vec::Vec; + use cfg_if::cfg_if; + + const MAX: LeakyLimb = LeakyLimb::MAX; + + fn leak_in_test(a: LimbMask) -> bool { + a.leak() + } + + #[test] + fn test_limbs_are_even() { + static EVENS: &[&[LeakyLimb]] = &[ + &[], + &[0], + &[2], + &[0, 0], + &[2, 0], + &[0, 1], + &[0, 2], + &[0, 3], + &[0, 0, 0, 0, MAX], + ]; + for even in EVENS { + let even = &Vec::from_iter(even.iter().copied().map(Limb::from)); + assert!(matches!( + limbs_reject_even_leak_bit(even), + Err(error::Unspecified) + )); + } + static ODDS: &[&[LeakyLimb]] = &[ + &[1], + &[3], + &[1, 0], + &[3, 0], + &[1, 1], + &[1, 2], + &[1, 3], + &[1, 0, 0, 0, MAX], + ]; + for odd in ODDS { + let odd = &Vec::from_iter(odd.iter().copied().map(Limb::from)); + assert!(matches!(limbs_reject_even_leak_bit(odd), Ok(()))); + } + } + + static ZEROES: &[&[LeakyLimb]] = &[ + &[], + &[0], + &[0, 0], + &[0, 0, 0], + &[0, 0, 0, 0], + &[0, 0, 0, 0, 0], + &[0, 0, 0, 0, 0, 0, 0], + &[0, 0, 0, 0, 0, 0, 0, 0], + &[0, 0, 0, 0, 0, 0, 0, 0, 0], + ]; + + static NONZEROES: &[&[LeakyLimb]] = &[ + &[1], + &[0, 1], + &[1, 1], + &[1, 0, 0, 0], + &[0, 1, 0, 0], + &[0, 0, 1, 0], + &[0, 0, 0, 1], + ]; + + #[test] + fn test_limbs_are_zero() { + for zero in ZEROES { + let zero = &Vec::from_iter(zero.iter().copied().map(Limb::from)); + assert!(leak_in_test(limbs_are_zero(zero))); + } + for nonzero in NONZEROES { + let nonzero = &Vec::from_iter(nonzero.iter().copied().map(Limb::from)); + assert!(!leak_in_test(limbs_are_zero(nonzero))); + } + } + + #[test] + fn test_limbs_equal_limb() { + // Equal + static EQUAL: &[&[LeakyLimb]] = &[&[1], &[1, 0], &[1, 0, 0], &[1, 0, 0, 0, 0, 0, 0]]; + for a in EQUAL { + let a = &Vec::from_iter(a.iter().copied().map(Limb::from)); + assert!(matches!(verify_limbs_equal_1_leak_bit(a), Ok(()))); + } + + // Unequal + static UNEQUAL: &[&[LeakyLimb]] = &[ + &[0], + &[2], + &[3], + &[MAX], + &[0, 1], + &[1, 1], + &[0, 0, 0, 0, 0, 0, 0, 1], + &[0, 0, 0, 0, 1, 0, 0, 0], + &[0, 0, 0, 0, 1, 0, 0, 1], + &[MAX, 1], + ]; + for a in UNEQUAL { + let a = &Vec::from_iter(a.iter().copied().map(Limb::from)); + assert!(matches!( + verify_limbs_equal_1_leak_bit(a), + Err(error::Unspecified) + )); + } + } + + #[test] + fn test_parse_big_endian_and_pad_consttime() { + const LIMBS: usize = 4; + + { + // Empty input. + let inp = untrusted::Input::from(&[]); + let mut result = [0; LIMBS].map(From::::from); + assert!(parse_big_endian_and_pad_consttime(inp, &mut result).is_err()); + } + + // The input is longer than will fit in the given number of limbs. + { + let inp = [1, 2, 3, 4, 5, 6, 7, 8, 9]; + let inp = untrusted::Input::from(&inp); + let mut result = [0; 8 / LIMB_BYTES].map(From::::from); + assert!(parse_big_endian_and_pad_consttime(inp, &mut result[..]).is_err()); + } + + // Less than a full limb. + { + let inp = [0xfe]; + let inp = untrusted::Input::from(&inp); + let mut result = [0; LIMBS].map(From::::from); + assert_eq!( + Ok(()), + parse_big_endian_and_pad_consttime(inp, &mut result[..]) + ); + assert_eq!(&[0xfe, 0, 0, 0], &result); + } + + // A whole limb for 32-bit, half a limb for 64-bit. + { + let inp = [0xbe, 0xef, 0xf0, 0x0d]; + let inp = untrusted::Input::from(&inp); + let mut result = [0; LIMBS].map(From::::from); + assert_eq!(Ok(()), parse_big_endian_and_pad_consttime(inp, &mut result)); + assert_eq!(&[0xbeeff00d, 0, 0, 0], &result); + } + + cfg_if! 
{ + if #[cfg(target_pointer_width = "64")] { + static TEST_CASES: &[(&[u8], &[Limb])] = &[ + (&[1], &[1, 0]), + (&[1, 2], &[0x102, 0]), + (&[1, 2, 3], &[0x10203, 0]), + (&[1, 2, 3, 4], &[0x102_0304, 0]), + (&[1, 2, 3, 4, 5], &[0x1_0203_0405, 0]), + (&[1, 2, 3, 4, 5, 6], &[0x102_0304_0506, 0]), + (&[1, 2, 3, 4, 5, 6, 7], &[0x1_0203_0405_0607, 0]), + (&[1, 2, 3, 4, 5, 6, 7, 8], &[0x102_0304_0506_0708, 0]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9], &[0x0203_0405_0607_0809, 0x1]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa], &[0x0304_0506_0708_090a, 0x102]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb], &[0x0405_0607_0809_0a0b, 0x1_0203]), + ]; + for (be_bytes, limbs) in TEST_CASES { + let mut buf = [0; 2]; + parse_big_endian_and_pad_consttime(untrusted::Input::from(be_bytes), &mut buf) + .unwrap(); + assert_eq!(limbs, &buf, "({be_bytes:x?}, {limbs:x?}"); + } + } else if #[cfg(target_pointer_width = "32")] { + static TEST_CASES: &[(&[u8], &[Limb])] = &[ + (&[1], &[1, 0, 0]), + (&[1, 2], &[0x102, 0, 0]), + (&[1, 2, 3], &[0x10203, 0, 0]), + (&[1, 2, 3, 4], &[0x102_0304, 0, 0]), + (&[1, 2, 3, 4, 5], &[0x0203_0405, 0x1, 0]), + (&[1, 2, 3, 4, 5, 6], &[0x0304_0506, 0x102, 0]), + (&[1, 2, 3, 4, 5, 6, 7], &[0x0405_0607, 0x1_0203, 0]), + (&[1, 2, 3, 4, 5, 6, 7, 8], &[0x0506_0708, 0x102_0304, 0]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9], &[0x0607_0809, 0x0203_0405, 0x1]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa], &[0x0708_090a, 0x0304_0506, 0x102]), + (&[1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb], &[0x0809_0a0b, 0x0405_0607, 0x1_0203]), + ]; + for (be_bytes, limbs) in TEST_CASES { + let mut buf = [0; 3]; + parse_big_endian_and_pad_consttime(untrusted::Input::from(be_bytes), &mut buf) + .unwrap(); + assert_eq!(limbs, &buf, "({be_bytes:x?}, {limbs:x?}"); + } + } else { + panic!("Unsupported target_pointer_width"); + } + + // XXX: This is a weak set of tests. TODO: expand it. + } + } + + #[test] + fn test_big_endian_from_limbs_same_length() { + #[cfg(target_pointer_width = "32")] + let limbs = [ + 0xbccddeef, 0x89900aab, 0x45566778, 0x01122334, 0xddeeff00, 0x99aabbcc, 0x55667788, + 0x11223344, + ]; + + #[cfg(target_pointer_width = "64")] + let limbs = [ + 0x8990_0aab_bccd_deef, + 0x0112_2334_4556_6778, + 0x99aa_bbcc_ddee_ff00, + 0x1122_3344_5566_7788, + ]; + + let limbs = limbs.map(From::::from); + + let expected = [ + 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, + 0xff, 0x00, 0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78, 0x89, 0x90, 0x0a, 0xab, + 0xbc, 0xcd, 0xde, 0xef, + ]; + + let mut out = [0xabu8; 32]; + big_endian_from_limbs(&limbs[..], &mut out); + assert_eq!(&out[..], &expected[..]); + } + + #[should_panic] + #[test] + fn test_big_endian_from_limbs_fewer_limbs() { + #[cfg(target_pointer_width = "32")] + // Two fewer limbs. + let limbs = [ + 0xbccddeef, 0x89900aab, 0x45566778, 0x01122334, 0xddeeff00, 0x99aabbcc, + ]; + + // One fewer limb. 
+ #[cfg(target_pointer_width = "64")] + let limbs = [ + 0x8990_0aab_bccd_deef, + 0x0112_2334_4556_6778, + 0x99aa_bbcc_ddee_ff00, + ]; + + let limbs = limbs.map(From::::from); + + let mut out = [0xabu8; 32]; + + big_endian_from_limbs(&limbs[..], &mut out); + } + + #[test] + fn test_limbs_minimal_bits() { + const ALL_ONES: LeakyLimb = LeakyLimb::MAX; + static CASES: &[(&[LeakyLimb], usize)] = &[ + (&[], 0), + (&[0], 0), + (&[ALL_ONES], LIMB_BITS), + (&[ALL_ONES, 0], LIMB_BITS), + (&[ALL_ONES, 1], LIMB_BITS + 1), + (&[0, 0], 0), + (&[1, 0], 1), + (&[0, 1], LIMB_BITS + 1), + (&[0, ALL_ONES], 2 * LIMB_BITS), + (&[ALL_ONES, ALL_ONES], 2 * LIMB_BITS), + (&[ALL_ONES, ALL_ONES >> 1], 2 * LIMB_BITS - 1), + (&[ALL_ONES, 0b100_0000], LIMB_BITS + 7), + (&[ALL_ONES, 0b101_0000], LIMB_BITS + 7), + (&[ALL_ONES, ALL_ONES >> 1], LIMB_BITS + (LIMB_BITS) - 1), + ]; + for (limbs, bits) in CASES { + let limbs = &Vec::from_iter(limbs.iter().copied().map(Limb::from)); + assert_eq!(limbs_minimal_bits(limbs).as_bits(), *bits); + } + } +} diff --git a/ring-0.17.14/src/pbkdf2.rs b/ring-0.17.14/src/pbkdf2.rs new file mode 100644 index 0000000000..7baacdbdc9 --- /dev/null +++ b/ring-0.17.14/src/pbkdf2.rs @@ -0,0 +1,344 @@ +// Copyright 2015 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! PBKDF2 derivation and verification. +//! +//! Use `derive` to derive PBKDF2 outputs. Use `verify` to verify secret +//! against previously-derived outputs. +//! +//! PBKDF2 is specified in [RFC 2898 Section 5.2] with test vectors given in +//! [RFC 6070]. See also [NIST Special Publication 800-132]. +//! +//! [RFC 2898 Section 5.2]: https://tools.ietf.org/html/rfc2898#section-5.2 +//! [RFC 6070]: https://tools.ietf.org/html/rfc6070 +//! [NIST Special Publication 800-132]: +//! http://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-132.pdf +//! +//! # Examples +//! +//! ## Password Database Example +//! +//! ``` +//! use ring::{digest, pbkdf2}; +//! use std::{collections::HashMap, num::NonZeroU32}; +//! +//! static PBKDF2_ALG: pbkdf2::Algorithm = pbkdf2::PBKDF2_HMAC_SHA256; +//! const CREDENTIAL_LEN: usize = digest::SHA256_OUTPUT_LEN; +//! pub type Credential = [u8; CREDENTIAL_LEN]; +//! +//! enum Error { +//! WrongUsernameOrPassword +//! } +//! +//! struct PasswordDatabase { +//! pbkdf2_iterations: NonZeroU32, +//! db_salt_component: [u8; 16], +//! +//! // Normally this would be a persistent database. +//! storage: HashMap, +//! } +//! +//! impl PasswordDatabase { +//! pub fn store_password(&mut self, username: &str, password: &str) { +//! let salt = self.salt(username); +//! let mut to_store: Credential = [0u8; CREDENTIAL_LEN]; +//! pbkdf2::derive(PBKDF2_ALG, self.pbkdf2_iterations, &salt, +//! password.as_bytes(), &mut to_store); +//! self.storage.insert(String::from(username), to_store); +//! } +//! +//! 
pub fn verify_password(&self, username: &str, attempted_password: &str) +//! -> Result<(), Error> { +//! match self.storage.get(username) { +//! Some(actual_password) => { +//! let salt = self.salt(username); +//! pbkdf2::verify(PBKDF2_ALG, self.pbkdf2_iterations, &salt, +//! attempted_password.as_bytes(), +//! actual_password) +//! .map_err(|_| Error::WrongUsernameOrPassword) +//! }, +//! +//! None => Err(Error::WrongUsernameOrPassword) +//! } +//! } +//! +//! // The salt should have a user-specific component so that an attacker +//! // cannot crack one password for multiple users in the database. It +//! // should have a database-unique component so that an attacker cannot +//! // crack the same user's password across databases in the unfortunate +//! // but common case that the user has used the same password for +//! // multiple systems. +//! fn salt(&self, username: &str) -> Vec { +//! let mut salt = Vec::with_capacity(self.db_salt_component.len() + +//! username.as_bytes().len()); +//! salt.extend(self.db_salt_component.as_ref()); +//! salt.extend(username.as_bytes()); +//! salt +//! } +//! } +//! +//! fn main() { +//! // Normally these parameters would be loaded from a configuration file. +//! let mut db = PasswordDatabase { +//! pbkdf2_iterations: NonZeroU32::new(100_000).unwrap(), +//! db_salt_component: [ +//! // This value was generated from a secure PRNG. +//! 0xd6, 0x26, 0x98, 0xda, 0xf4, 0xdc, 0x50, 0x52, +//! 0x24, 0xf2, 0x27, 0xd1, 0xfe, 0x39, 0x01, 0x8a +//! ], +//! storage: HashMap::new(), +//! }; +//! +//! db.store_password("alice", "@74d7]404j|W}6u"); +//! +//! // An attempt to log in with the wrong password fails. +//! assert!(db.verify_password("alice", "wrong password").is_err()); +//! +//! // Normally there should be an expoentially-increasing delay between +//! // attempts to further protect against online attacks. +//! +//! // An attempt to log in with the right password succeeds. +//! assert!(db.verify_password("alice", "@74d7]404j|W}6u").is_ok()); +//! } + +use self::{derive_error::DeriveError, verify_error::VerifyError}; +use crate::{ + bb, cpu, digest, + error::{self, TooMuchOutputRequestedError}, + hmac::{self, InputTooLongError}, +}; +use core::num::NonZeroU32; + +/// A PBKDF2 algorithm. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Algorithm(hmac::Algorithm); + +/// PBKDF2 using HMAC-SHA1. +pub static PBKDF2_HMAC_SHA1: Algorithm = Algorithm(hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY); + +/// PBKDF2 using HMAC-SHA256. +pub static PBKDF2_HMAC_SHA256: Algorithm = Algorithm(hmac::HMAC_SHA256); + +/// PBKDF2 using HMAC-SHA384. +pub static PBKDF2_HMAC_SHA384: Algorithm = Algorithm(hmac::HMAC_SHA384); + +/// PBKDF2 using HMAC-SHA512. +pub static PBKDF2_HMAC_SHA512: Algorithm = Algorithm(hmac::HMAC_SHA512); + +/// Fills `out` with the key derived using PBKDF2 with the given inputs. +/// +/// Do not use `derive` as part of verifying a secret; use `verify` instead, to +/// minimize the effectiveness of timing attacks. +/// +/// `out.len()` must be no larger than the digest length * (2**32 - 1), per the +/// PBKDF2 specification. 
+/// +/// | Parameter | RFC 2898 Section 5.2 Term +/// |-------------|------------------------------------------- +/// | digest_alg | PRF (HMAC with the given digest algorithm) +/// | iterations | c (iteration count) +/// | salt | S (salt) +/// | secret | P (password) +/// | out | dk (derived key) +/// | out.len() | dkLen (derived key length) +/// +/// # Panics +/// +/// Panics if `out.len() > u32::MAX * digest_alg.output_len()`, where +/// `digest_alg` is the underlying HMAC/digest algorithm. +/// +/// Panics if `salt` is so astronomically gigantic that it isn't a valid input +/// to the underlying digest function. +/// +/// Panics if `secret` is so astronomically gigantic that it isn't a valid +/// input to the underlying digest function. +pub fn derive( + algorithm: Algorithm, + iterations: NonZeroU32, + salt: &[u8], + secret: &[u8], + out: &mut [u8], +) { + let cpu = cpu::features(); + try_derive(algorithm, iterations, salt, secret, out, cpu) + .map_err(error::erase::) + .unwrap() +} + +fn try_derive( + algorithm: Algorithm, + iterations: NonZeroU32, + salt: &[u8], + secret: &[u8], + out: &mut [u8], + cpu: cpu::Features, +) -> Result<(), DeriveError> { + let digest_alg = algorithm.0.digest_algorithm(); + let output_len = digest_alg.output_len(); + + // This implementation's performance is asymptotically optimal as described + // in https://jbp.io/2015/08/11/pbkdf2-performance-matters/. However, it + // hasn't been optimized to the same extent as fastpbkdf2. In particular, + // this implementation is probably doing a lot of unnecessary copying. + + let secret = + hmac::Key::try_new(algorithm.0, secret, cpu).map_err(DeriveError::secret_too_long)?; + + // Clear |out|. + out.fill(0); + + let mut idx: u32 = 0; + + let out_len = out.len(); + for chunk in out.chunks_mut(output_len) { + idx = idx.checked_add(1).ok_or_else(|| { + DeriveError::too_much_output_requested(TooMuchOutputRequestedError::new(out_len)) + })?; + // If the salt is too long, then we'll detect this on the first + // iteration before we've written any output. + derive_block(&secret, iterations, salt, idx, chunk, cpu) + .map_err(DeriveError::salt_too_long)?; + } + Ok(()) +} + +fn derive_block( + secret: &hmac::Key, + iterations: NonZeroU32, + salt: &[u8], + idx: u32, + out: &mut [u8], + cpu: cpu::Features, +) -> Result<(), InputTooLongError> { + let mut ctx = hmac::Context::with_key(secret); + ctx.update(salt); + ctx.update(&u32::to_be_bytes(idx)); + + let mut u = ctx.try_sign(cpu)?; + + let mut remaining: u32 = iterations.into(); + loop { + bb::xor_assign_at_start(&mut out[..], u.as_ref()); + + if remaining == 1 { + break; + } + remaining -= 1; + + // This will not fail, because the output of HMAC is never too long to + // be an input for the same algorithm, but we can't prove that with + // only locally-available information. + u = secret.sign(u.as_ref(), cpu)? + } + Ok(()) +} + +cold_exhaustive_error! { + enum derive_error::DeriveError { + secret_too_long => SecretTooLong(InputTooLongError), + salt_too_long => SaltTooLong(InputTooLongError), + too_much_output_requested => TooMuchOutputRequested(TooMuchOutputRequestedError), + } +} + +cold_exhaustive_error! 
{ + enum verify_error::VerifyError { + mismatch => Mismatch(()), + secret_too_long => SecretTooLong(InputTooLongError), + salt_too_long => SaltTooLong(InputTooLongError), + previously_derived_empty => PreviouslyDerivedEmpty(usize), + } +} + +/// Verifies that a previously-derived (e.g., using `derive`) PBKDF2 value +/// matches the PBKDF2 value derived from the other inputs. +/// +/// The comparison is done in constant time to prevent timing attacks. The +/// comparison will fail if `previously_derived` is empty (has a length of +/// zero). +/// +/// | Parameter | RFC 2898 Section 5.2 Term +/// |----------------------------|-------------------------------------------- +/// | digest_alg | PRF (HMAC with the given digest algorithm). +/// | `iterations` | c (iteration count) +/// | `salt` | S (salt) +/// | `secret` | P (password) +/// | `previously_derived` | dk (derived key) +/// | `previously_derived.len()` | dkLen (derived key length) +pub fn verify( + algorithm: Algorithm, + iterations: NonZeroU32, + salt: &[u8], + secret: &[u8], + previously_derived: &[u8], +) -> Result<(), error::Unspecified> { + let cpu = cpu::features(); + try_verify(algorithm, iterations, salt, secret, previously_derived, cpu) + .map_err(error::erase::) +} + +fn try_verify( + algorithm: Algorithm, + iterations: NonZeroU32, + salt: &[u8], + secret: &[u8], + previously_derived: &[u8], + cpu: cpu::Features, +) -> Result<(), VerifyError> { + let digest_alg = algorithm.0.digest_algorithm(); + + if previously_derived.is_empty() { + return Err(VerifyError::previously_derived_empty(0)); + } + + let mut derived_buf = [0u8; digest::MAX_OUTPUT_LEN]; + + let output_len = digest_alg.output_len(); + let secret = + hmac::Key::try_new(algorithm.0, secret, cpu).map_err(VerifyError::secret_too_long)?; + let mut idx: u32 = 0; + + let mut matches = 1; + + for previously_derived_chunk in previously_derived.chunks(output_len) { + idx = idx.checked_add(1).ok_or_else(|| { + // `previously_derived` is so gigantic that PBKDF2 couldn't + // have been used to compute it. + VerifyError::mismatch(()) + })?; + + let derived_chunk = &mut derived_buf[..previously_derived_chunk.len()]; + derived_chunk.fill(0); + + derive_block(&secret, iterations, salt, idx, derived_chunk, cpu) + .map_err(VerifyError::salt_too_long)?; + + // XXX: This isn't fully constant-time-safe. TODO: Fix that. + #[allow(clippy::bool_to_int_with_if)] + let current_block_matches = + if bb::verify_slices_are_equal(derived_chunk, previously_derived_chunk).is_ok() { + 1 + } else { + 0 + }; + + matches &= current_block_matches; + } + + if matches == 0 { + return Err(VerifyError::mismatch(())); + } + + Ok(()) +} diff --git a/ring-0.17.14/src/pkcs8.rs b/ring-0.17.14/src/pkcs8.rs new file mode 100644 index 0000000000..5ef66fd18e --- /dev/null +++ b/ring-0.17.14/src/pkcs8.rs @@ -0,0 +1,216 @@ +// Copyright 2017 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! PKCS#8 is specified in [RFC 5958]. +//! +//! [RFC 5958]: https://tools.ietf.org/html/rfc5958 + +use crate::{ec, error, io::der}; + +pub(crate) struct PublicKeyOptions { + /// Should the wrong public key ASN.1 tagging used by early implementations + /// of PKCS#8 v2 (including earlier versions of *ring*) be accepted? + pub accept_legacy_ed25519_public_key_tag: bool, +} + +pub(crate) enum Version { + V1Only, + V1OrV2(PublicKeyOptions), + V2Only(PublicKeyOptions), +} + +/// A template for constructing PKCS#8 documents. +/// +/// Note that this only works for ECC. +pub(crate) struct Template { + pub bytes: &'static [u8], + + // The range within `bytes` that holds the value (not including the tag and + // length) for use in the PKCS#8 document's privateKeyAlgorithm field. + pub alg_id_range: core::ops::Range, + + // `bytes[alg_id_range][curve_id_index..]` contains the OID identifying the, + // curve, including the tag and length. + pub curve_id_index: usize, + + // `bytes` will be split into two parts at `private_key_index`, where the + // first part is written before the private key and the second part is + // written after the private key. The public key is written after the second + // part. + pub private_key_index: usize, +} + +impl Template { + #[inline] + fn alg_id_value(&self) -> untrusted::Input { + untrusted::Input::from(self.alg_id_value_()) + } + + fn alg_id_value_(&self) -> &[u8] { + &self.bytes[self.alg_id_range.start..self.alg_id_range.end] + } + + #[inline] + pub fn curve_oid(&self) -> untrusted::Input { + untrusted::Input::from(&self.alg_id_value_()[self.curve_id_index..]) + } +} + +/// Parses an unencrypted PKCS#8 private key, verifies that it is the right type +/// of key, and returns the key value. +/// +/// PKCS#8 is specified in [RFC 5958]. +/// +/// [RFC 5958]: https://tools.ietf.org/html/rfc5958 +pub(crate) fn unwrap_key<'a>( + template: &Template, + version: Version, + input: untrusted::Input<'a>, +) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { + unwrap_key_(template.alg_id_value(), version, input) +} + +/// Parses an unencrypted PKCS#8 private key, verifies that it is the right type +/// of key, and returns the key value. +/// +/// `alg_id` must be the encoded value (not including the outermost `SEQUENCE` +/// tag and length) of the `AlgorithmIdentifier` that identifies the key type. +/// The result will be an encoded `RSAPrivateKey` or `ECPrivateKey` or similar. +/// +/// PKCS#8 is specified in [RFC 5958]. 
+/// +/// [RFC 5958]: https://tools.ietf.org/html/rfc5958 +pub(crate) fn unwrap_key_<'a>( + alg_id: untrusted::Input, + version: Version, + input: untrusted::Input<'a>, +) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { + input.read_all(error::KeyRejected::invalid_encoding(), |input| { + der::nested( + input, + der::Tag::Sequence, + error::KeyRejected::invalid_encoding(), + |input| unwrap_key__(alg_id, version, input), + ) + }) +} + +fn unwrap_key__<'a>( + alg_id: untrusted::Input, + version: Version, + input: &mut untrusted::Reader<'a>, +) -> Result<(untrusted::Input<'a>, Option>), error::KeyRejected> { + let actual_version = der::small_nonnegative_integer(input) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + + // Do things in a specific order to return more useful errors: + // 1. Check for completely unsupported version. + // 2. Check for algorithm mismatch. + // 3. Check for algorithm-specific version mismatch. + + if actual_version > 1 { + return Err(error::KeyRejected::version_not_supported()); + }; + + let actual_alg_id = der::expect_tag_and_get_value(input, der::Tag::Sequence) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + if actual_alg_id.as_slice_less_safe() != alg_id.as_slice_less_safe() { + return Err(error::KeyRejected::wrong_algorithm()); + } + + let public_key_options = match (actual_version, version) { + (0, Version::V1Only) => None, + (0, Version::V1OrV2(_)) => None, + (1, Version::V1OrV2(options)) | (1, Version::V2Only(options)) => Some(options), + _ => { + return Err(error::KeyRejected::version_not_supported()); + } + }; + + let private_key = der::expect_tag_and_get_value(input, der::Tag::OctetString) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + + // Ignore any attributes that are present. + if input.peek(der::Tag::ContextSpecificConstructed0.into()) { + let _ = der::expect_tag_and_get_value(input, der::Tag::ContextSpecificConstructed0) + .map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + } + + let public_key = if let Some(options) = public_key_options { + if input.at_end() { + return Err(error::KeyRejected::public_key_is_missing()); + } + + const INCORRECT_LEGACY: der::Tag = der::Tag::ContextSpecificConstructed1; + let result = if options.accept_legacy_ed25519_public_key_tag + && input.peek(INCORRECT_LEGACY.into()) + { + der::nested( + input, + INCORRECT_LEGACY, + error::Unspecified, + der::bit_string_with_no_unused_bits, + ) + } else { + der::bit_string_tagged_with_no_unused_bits(der::Tag::ContextSpecific1, input) + }; + let public_key = + result.map_err(|error::Unspecified| error::KeyRejected::invalid_encoding())?; + Some(public_key) + } else { + None + }; + + Ok((private_key, public_key)) +} + +/// A generated PKCS#8 document. +pub struct Document { + bytes: [u8; ec::PKCS8_DOCUMENT_MAX_LEN], + len: usize, +} + +impl AsRef<[u8]> for Document { + #[inline] + fn as_ref(&self) -> &[u8] { + &self.bytes[..self.len] + } +} + +pub(crate) fn wrap_key(template: &Template, private_key: &[u8], public_key: &[u8]) -> Document { + let mut result = Document { + bytes: [0; ec::PKCS8_DOCUMENT_MAX_LEN], + len: template.bytes.len() + private_key.len() + public_key.len(), + }; + wrap_key_( + template, + private_key, + public_key, + &mut result.bytes[..result.len], + ); + result +} + +/// Formats a private key "prefix||private_key||middle||public_key" where +/// `template` is "prefix||middle" split at position `private_key_index`. 
+fn wrap_key_(template: &Template, private_key: &[u8], public_key: &[u8], bytes: &mut [u8]) { + let (before_private_key, after_private_key) = + template.bytes.split_at(template.private_key_index); + let private_key_end_index = template.private_key_index + private_key.len(); + bytes[..template.private_key_index].copy_from_slice(before_private_key); + bytes[template.private_key_index..private_key_end_index].copy_from_slice(private_key); + bytes[private_key_end_index..(private_key_end_index + after_private_key.len())] + .copy_from_slice(after_private_key); + bytes[(private_key_end_index + after_private_key.len())..].copy_from_slice(public_key); +} diff --git a/ring-0.17.14/src/polyfill.rs b/ring-0.17.14/src/polyfill.rs new file mode 100644 index 0000000000..cf513208aa --- /dev/null +++ b/ring-0.17.14/src/polyfill.rs @@ -0,0 +1,107 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Polyfills for functionality that will (hopefully) be added to Rust's +//! standard library soon. + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +#[inline(always)] +pub const fn u64_from_usize(x: usize) -> u64 { + x as u64 +} + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +pub const fn usize_from_u32(x: u32) -> usize { + x as usize +} + +#[cfg(all( + target_arch = "aarch64", + target_endian = "little", + target_pointer_width = "64" +))] +#[allow(clippy::cast_possible_truncation)] +pub fn usize_from_u64(x: u64) -> usize { + x as usize +} + +/// const-capable `x.try_into().unwrap_or(usize::MAX)` +#[allow(clippy::cast_possible_truncation)] +#[inline(always)] +pub const fn usize_from_u64_saturated(x: u64) -> usize { + const USIZE_MAX: u64 = u64_from_usize(usize::MAX); + if x < USIZE_MAX { + x as usize + } else { + usize::MAX + } +} + +#[macro_use] +mod cold_error; + +mod array_flat_map; +mod array_split_map; + +pub mod cstr; + +pub mod sliceutil; + +#[cfg(feature = "alloc")] +mod leading_zeros_skipped; + +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +))] +pub mod once_cell { + pub mod race; +} + +mod notsend; +pub mod ptr; + +pub mod slice; + +#[cfg(test)] +mod test; + +mod unwrap_const; + +pub use self::{ + array_flat_map::ArrayFlatMap, array_split_map::ArraySplitMap, notsend::NotSend, + unwrap_const::unwrap_const, +}; + +#[cfg(feature = "alloc")] +pub use leading_zeros_skipped::LeadingZerosStripped; + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_usize_from_u64_saturated() { + const USIZE_MAX: u64 = u64_from_usize(usize::MAX); + assert_eq!(usize_from_u64_saturated(u64::MIN), usize::MIN); + assert_eq!(usize_from_u64_saturated(USIZE_MAX), usize::MAX); + assert_eq!(usize_from_u64_saturated(USIZE_MAX - 1), 
usize::MAX - 1); + + #[cfg(not(target_pointer_width = "64"))] + { + assert_eq!(usize_from_u64_saturated(USIZE_MAX + 1), usize::MAX); + } + } +} diff --git a/ring-0.17.14/src/polyfill/array_flat_map.rs b/ring-0.17.14/src/polyfill/array_flat_map.rs new file mode 100644 index 0000000000..8b3abbefe1 --- /dev/null +++ b/ring-0.17.14/src/polyfill/array_flat_map.rs @@ -0,0 +1,127 @@ +use core::iter::FlatMap; + +/// A specialized version of `core::iter::FlatMap` for mapping over exact-sized +/// iterators with a function that returns an array. +/// +/// `ArrayFlatMap` differs from `FlatMap` in that `ArrayFlatMap` implements +/// `ExactSizeIterator`. Since the result of `F` always has `LEN` elements, if +/// `I` is an exact-sized iterator of length `inner_len` then we know the +/// length of the flat-mapped result is `inner_len * LEN`. (The constructor +/// verifies that this multiplication doesn't overflow `usize`.) +#[derive(Clone)] +pub struct ArrayFlatMap { + inner: FlatMap, + remaining: usize, +} + +impl ArrayFlatMap +where + I: ExactSizeIterator, + F: FnMut(I::Item) -> [Item; LEN], +{ + /// Constructs an `ArrayFlatMap` wrapping the given iterator, using the + /// given function + pub fn new(inner: I, f: F) -> Option { + let remaining = inner.len().checked_mul(LEN)?; + let inner = inner.flat_map(f); + Some(Self { inner, remaining }) + } +} + +impl Iterator for ArrayFlatMap +where + I: Iterator, + F: FnMut(I::Item) -> [Item; LEN], +{ + type Item = Item; + + fn next(&mut self) -> Option { + let result = self.inner.next(); + if result.is_some() { + self.remaining -= 1; + } + result + } + + /// Required for implementing `ExactSizeIterator`. + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } +} + +impl ExactSizeIterator for ArrayFlatMap +where + I: Iterator, + F: FnMut(I::Item) -> [Item; LEN], +{ +} + +#[cfg(test)] +mod tests { + use super::*; + use core::mem::size_of; + + #[test] + fn test_array_flat_map() { + static TEST_CASES: &[(&[u16], fn(u16) -> [u8; 2], &[u8])] = &[ + // Empty input + (&[], u16::to_be_bytes, &[]), + // Non-empty input. + ( + &[0x0102, 0x0304, 0x0506], + u16::to_be_bytes, + &[1, 2, 3, 4, 5, 6], + ), + // Test with a different mapping function. + ( + &[0x0102, 0x0304, 0x0506], + u16::to_le_bytes, + &[2, 1, 4, 3, 6, 5], + ), + ]; + TEST_CASES.iter().copied().for_each(|(input, f, expected)| { + let mapped = ArrayFlatMap::new(input.iter().copied(), f).unwrap(); + super::super::test::assert_iterator(mapped, expected); + }); + } + + // Does ArrayFlatMap::new() handle overflow correctly? 
+ #[test] + fn test_array_flat_map_len_overflow() { + struct DownwardCounter { + remaining: usize, + } + impl Iterator for DownwardCounter { + type Item = usize; + + fn next(&mut self) -> Option { + if self.remaining > 0 { + let result = self.remaining; + self.remaining -= 1; + Some(result) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } + } + impl ExactSizeIterator for DownwardCounter {} + + const MAX: usize = usize::MAX / size_of::(); + + static TEST_CASES: &[(usize, bool)] = &[(MAX, true), (MAX + 1, false)]; + TEST_CASES.iter().copied().for_each(|(input_len, is_some)| { + let inner = DownwardCounter { + remaining: input_len, + }; + let mapped = ArrayFlatMap::new(inner, usize::to_be_bytes); + assert_eq!(mapped.is_some(), is_some); + if let Some(mapped) = mapped { + assert_eq!(mapped.len(), input_len * size_of::()); + } + }); + } +} diff --git a/ring-0.17.14/src/polyfill/array_split_map.rs b/ring-0.17.14/src/polyfill/array_split_map.rs new file mode 100644 index 0000000000..c754330d23 --- /dev/null +++ b/ring-0.17.14/src/polyfill/array_split_map.rs @@ -0,0 +1,71 @@ +// Copyright 2023 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
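+
+// (Illustrative note added in this vendored copy, not part of upstream ring:
+// a minimal sketch of how the `ArraySplitMap` trait defined below is used,
+// e.g. turning a fixed-size big-endian byte array into words. The module name
+// and test are illustrative only.)
+#[cfg(test)]
+mod vendored_usage_sketch {
+    use super::ArraySplitMap;
+
+    #[test]
+    fn split_twelve_bytes_into_three_words() {
+        let bytes: [u8; 12] = [0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3];
+        // Each group of four big-endian bytes becomes one u32.
+        let words: [u32; 3] = bytes.array_split_map(u32::from_be_bytes);
+        assert_eq!(words, [1, 2, 3]);
+    }
+}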
+ +pub trait ArraySplitMap { + fn array_split_map(self, f: impl Fn([I; CN]) -> O) -> [O; ON]; +} + +impl ArraySplitMap for [I; 12] { + #[inline] + fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 3] { + let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3] = self; + [ + f([a0, a1, a2, a3]), + f([b0, b1, b2, b3]), + f([c0, c1, c2, c3]), + ] + } +} + +impl ArraySplitMap for [I; 16] { + #[inline] + fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 4] { + let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3] = self; + [ + f([a0, a1, a2, a3]), + f([b0, b1, b2, b3]), + f([c0, c1, c2, c3]), + f([d0, d1, d2, d3]), + ] + } +} + +impl ArraySplitMap for [I; 32] { + #[inline] + fn array_split_map(self, f: impl Fn([I; 4]) -> O) -> [O; 8] { + let [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3, e0, e1, e2, e3, f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3] = + self; + [ + f([a0, a1, a2, a3]), + f([b0, b1, b2, b3]), + f([c0, c1, c2, c3]), + f([d0, d1, d2, d3]), + f([e0, e1, e2, e3]), + f([f0, f1, f2, f3]), + f([g0, g1, g2, g3]), + f([h0, h1, h2, h3]), + ] + } +} + +impl ArraySplitMap for [I; 16] { + #[inline] + fn array_split_map(self, f: impl Fn([I; 8]) -> O) -> [O; 2] { + let [a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7] = self; + [ + f([a0, a1, a2, a3, a4, a5, a6, a7]), + f([b0, b1, b2, b3, b4, b5, b6, b7]), + ] + } +} diff --git a/ring-0.17.14/src/polyfill/cold_error.rs b/ring-0.17.14/src/polyfill/cold_error.rs new file mode 100644 index 0000000000..13fc9f6baa --- /dev/null +++ b/ring-0.17.14/src/polyfill/cold_error.rs @@ -0,0 +1,101 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +/// Reduces boilerplate for defining error types where we want the compiler to +/// optimize for the non-error path by assuming constructing an error is +/// unlikely/cold code. +/// +/// WARNING: Every struct/variant must contain some *non-constant* value so +/// that the "invariant code" pass of the compiler doesn't recognize the +/// constructor as being "invariant code" and optimizing it away; +/// although such optimization would be nice to take advantage of, it +/// seems to lose the `#[cold]` attribute. +/// +/// Constructor functions ar marked `pub(super)` to ensure that instances can +/// only be constructed from within the enclosing module (and its submodules). +/// +/// XXX: #[inline(never)] is required to avoid the (MIR?) optimizer inlining +/// away the function call and losing the `#[cold]` attribute in the process. +/// We'd otherwise maybe prefer all constructors to be inline. +/// +/// The type is defined in its own submodule `#mod_name` to hide the +/// variant/struct constructor, ensuring instances are only constructed +/// through the generated `$constructor` functions. 
The constructor methods +/// work around the lack of the ability to mark an enum variant `#[cold]` and +/// `#[inline(never)]`. +macro_rules! cold_exhaustive_error { + // struct + { + struct $mod_name:ident::$Error:ident with $vis:vis constructor { + $field:ident: $ValueType:ty + } + } => { + mod $mod_name { + #[allow(unused_imports)] + use super::*; // So `$ValueType` is in scope. + + pub struct $Error { #[allow(dead_code)] $field: $ValueType } + + impl $Error { + #[cold] + #[inline(never)] + $vis fn new($field: $ValueType) -> Self { + Self { $field } + } + } + } + }; + // struct with default constructor visibility. + { + struct $mod_name:ident::$Error:ident { + $field:ident: $ValueType:ty + } + } => { + cold_exhaustive_error! { + struct $mod_name::$Error with pub(super) constructor { + $field: $ValueType + } + } + }; + + // enum + { + enum $mod_name:ident::$Error:ident { + $( + $constructor:ident => $Variant:ident($ValueType:ty), + )+ + } + } => { + mod $mod_name { + #[allow(unused_imports)] + use super::*; // So `$ValueType` is in scope. + + pub enum $Error { + $( + $Variant(#[allow(dead_code)] $ValueType) + ),+ + } + + impl $Error { + $( + #[cold] + #[inline(never)] + pub(super) fn $constructor(value: $ValueType) -> Self { + Self::$Variant(value) + } + )+ + } + } + }; +} diff --git a/ring-0.17.14/src/polyfill/cstr.rs b/ring-0.17.14/src/polyfill/cstr.rs new file mode 100644 index 0000000000..10739f9b2a --- /dev/null +++ b/ring-0.17.14/src/polyfill/cstr.rs @@ -0,0 +1,107 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(all( + target_vendor = "apple", + any( + target_os = "ios", + target_os = "macos", + target_os = "tvos", + target_os = "visionos", + target_os = "watchos" + ) +))] + +//! Work around lack of `core::ffi::CStr` prior to Rust 1.64, and the lack of +//! `const fn` support for `CStr` in later versions. + +#![cfg(all( + all(target_arch = "aarch64", target_endian = "little"), + target_vendor = "apple" +))] + +use core::mem::{align_of, size_of}; + +// TODO(MSRV 1.64): Use `core::ffi::c_char`. +use libc::c_char; + +// TODO(MSRV 1.64): Replace with `&core::ffi::CStr`. +pub struct Ref(&'static [u8]); + +impl Ref { + #[inline(always)] + pub fn as_ptr(&self) -> *const c_char { + const _SAME_ALIGNMENT: () = assert!(align_of::() == align_of::()); + const _SAME_SIZE: () = assert!(size_of::() == size_of::()); + + // It is safe to cast a `*const u8` to a `const c_char` as they are the + // same size and alignment. + self.0.as_ptr().cast() + } + + // SAFETY: Same as `CStr::from_bytes_with_nul_unchecked`. + const unsafe fn from_bytes_with_nul_unchecked(value: &'static [u8]) -> Self { + Self(value) + } +} + +pub const fn unwrap_const_from_bytes_with_nul(value: &'static [u8]) -> Ref { + // XXX: We cannot use `unwrap_const` since `Ref`/`CStr` is not `Copy`. 
+ match const_from_bytes_with_nul(value) { + Some(r) => r, + None => panic!("const_from_bytes_with_nul failed"), + } +} + +// TODO(MSRV 1.72): Replace with `CStr::from_bytes_with_nul`. +#[inline(always)] +const fn const_from_bytes_with_nul(value: &'static [u8]) -> Option { + const fn const_contains(mut value: &[u8], needle: &u8) -> bool { + while let [head, tail @ ..] = value { + if *head == *needle { + return true; + } + value = tail; + } + false + } + + // TODO(MSRV 1.69): Use `core::ffi::CStr::from_bytes_until_nul` + match value { + [before_nul @ .., 0] if !const_contains(before_nul, &0) => { + // SAFETY: + // * `value` is nul-terminated according to the slice pattern. + // * `value` doesn't contain any interior null, by the guard. + // TODO(MSRV 1.64): Use `CStr::from_bytes_with_nul_unchecked` + Some(unsafe { Ref::from_bytes_with_nul_unchecked(value) }) + } + _ => None, + } +} + +mod tests { + use super::const_from_bytes_with_nul; + + // Bad. + const _EMPTY_UNTERMINATED: () = assert!(const_from_bytes_with_nul(b"").is_none()); + const _EMPTY_DOUBLE_TERMINATED: () = assert!(const_from_bytes_with_nul(b"\0\0").is_none()); + const _DOUBLE_NUL: () = assert!(const_from_bytes_with_nul(b"\0\0").is_none()); + const _LEADINGL_NUL: () = assert!(const_from_bytes_with_nul(b"\0a\0").is_none()); + const _INTERNAL_NUL_UNTERMINATED: () = assert!(const_from_bytes_with_nul(b"\0a").is_none()); + + // Good. + const _EMPTY_TERMINATED: () = assert!(const_from_bytes_with_nul(b"\0").is_some()); + const _NONEMPTY: () = assert!(const_from_bytes_with_nul(b"asdf\0").is_some()); + const _1_CHAR: () = assert!(const_from_bytes_with_nul(b"a\0").is_some()); +} diff --git a/ring-0.17.14/src/polyfill/leading_zeros_skipped.rs b/ring-0.17.14/src/polyfill/leading_zeros_skipped.rs new file mode 100644 index 0000000000..36053f31f0 --- /dev/null +++ b/ring-0.17.14/src/polyfill/leading_zeros_skipped.rs @@ -0,0 +1,78 @@ +use core::iter::Peekable; + +/// An iterator that skips all leading zeros. +/// +/// When the wrapped iterator is all zeros, then the last item is retained. +pub struct LeadingZerosStripped +where + I: Iterator, +{ + inner: Peekable, +} + +impl Clone for LeadingZerosStripped +where + I: Iterator, + Peekable: Clone, +{ + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + } + } +} + +impl LeadingZerosStripped +where + I: ExactSizeIterator, +{ + pub fn new(inner: I) -> Self { + let mut len = inner.len(); + let mut inner = inner.peekable(); + // Strip all leading zeroes, but don't strip the last byte if all bytes + // were zero. 
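        // For illustration (hypothetical inputs): wrapping [0, 0, 7, 1] yields
        // the sequence 7, 1, while the all-zero input [0, 0] keeps a single
        // trailing 0.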
+ while len > 1 && inner.next_if_eq(&0).is_some() { + len -= 1; + } + Self { inner } + } +} + +impl Iterator for LeadingZerosStripped +where + I: Iterator, +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + self.inner.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl ExactSizeIterator for LeadingZerosStripped where I: ExactSizeIterator {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_leading_zeroes_stripped() { + static TEST_CASES: &[(&[u8], &[u8])] = &[ + (&[], &[]), + (&[0], &[0]), + (&[0, 1], &[1]), + (&[0, 0, 1], &[1]), + (&[0, 0, 0, 1], &[1]), + (&[1, 0], &[1, 0]), + (&[0, 1, 0], &[1, 0]), + ]; + TEST_CASES.iter().copied().for_each(|(input, expected)| { + let stripped = LeadingZerosStripped::new(input.iter().copied()); + super::super::test::assert_iterator(stripped, expected); + }); + } +} diff --git a/ring-0.17.14/src/polyfill/notsend.rs b/ring-0.17.14/src/polyfill/notsend.rs new file mode 100644 index 0000000000..9c14006f57 --- /dev/null +++ b/ring-0.17.14/src/polyfill/notsend.rs @@ -0,0 +1,30 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use crate::testutil; +use core::{marker::PhantomData, mem::size_of}; + +/// A ZST that can be added to any type to make the type `!Send`. +#[derive(Clone, Copy)] +pub struct NotSend(PhantomData<*mut ()>); + +impl NotSend { + pub const VALUE: Self = Self(PhantomData); +} + +#[allow(deprecated)] +const _: () = testutil::compile_time_assert_clone::(); +#[allow(deprecated)] +const _: () = testutil::compile_time_assert_copy::(); +const _: () = assert!(size_of::() == 0); diff --git a/ring-0.17.14/src/polyfill/once_cell/LICENSE-APACHE b/ring-0.17.14/src/polyfill/once_cell/LICENSE-APACHE new file mode 100644 index 0000000000..16fe87b06e --- /dev/null +++ b/ring-0.17.14/src/polyfill/once_cell/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/ring-0.17.14/src/polyfill/once_cell/LICENSE-MIT b/ring-0.17.14/src/polyfill/once_cell/LICENSE-MIT new file mode 100644 index 0000000000..51feadc5aa --- /dev/null +++ b/ring-0.17.14/src/polyfill/once_cell/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHOR OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/ring-0.17.14/src/polyfill/once_cell/race.rs b/ring-0.17.14/src/polyfill/once_cell/race.rs new file mode 100644 index 0000000000..6fc4056913 --- /dev/null +++ b/ring-0.17.14/src/polyfill/once_cell/race.rs @@ -0,0 +1,77 @@ +//! Thread-safe, non-blocking, "first one wins" flavor of `OnceCell`. +//! +//! If two threads race to initialize a type from the `race` module, they +//! don't block, execute initialization function together, but only one of +//! them stores the result. +//! +//! This module does not require `std` feature. +//! +//! # Atomic orderings +//! +//! All types in this module use `Acquire` and `Release` +//! [atomic orderings](Ordering) for all their operations. While this is not +//! strictly necessary for types other than `OnceBox`, it is useful for users as +//! it allows them to be certain that after `get` or `get_or_init` returns on +//! one thread, any side-effects caused by the setter thread prior to them +//! calling `set` or `get_or_init` will be made visible to that thread; without +//! it, it's possible for it to appear as if they haven't happened yet from the +//! getter thread's perspective. This is an acceptable tradeoff to make since +//! `Acquire` and `Release` have very little performance overhead on most +//! architectures versus `Relaxed`. + +use core::sync::atomic; + +use atomic::{AtomicUsize, Ordering}; +use core::num::NonZeroUsize; + +/// A thread-safe cell which can be written to only once. +pub struct OnceNonZeroUsize { + inner: AtomicUsize, +} + +impl OnceNonZeroUsize { + /// Creates a new empty cell. + #[inline] + pub const fn new() -> OnceNonZeroUsize { + OnceNonZeroUsize { + inner: AtomicUsize::new(0), + } + } + + /// Gets the underlying value. + #[inline] + pub fn get(&self) -> Option { + let val = self.inner.load(Ordering::Acquire); + NonZeroUsize::new(val) + } + + /// Gets the contents of the cell, initializing it with `f` if the cell was + /// empty. + /// + /// If several threads concurrently run `get_or_init`, more than one `f` can + /// be called. However, all threads will return the same value, produced by + /// some `f`. 
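    ///
    /// For illustration (hypothetical usage):
    ///
    /// ```ignore
    /// static CACHED: OnceNonZeroUsize = OnceNonZeroUsize::new();
    /// // The closure may run on several racing threads, but every caller
    /// // observes the same stored value.
    /// let n = CACHED.get_or_init(|| core::num::NonZeroUsize::new(4096).unwrap());
    /// ```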
+ pub fn get_or_init(&self, f: F) -> NonZeroUsize + where + F: FnOnce() -> NonZeroUsize, + { + let val = self.inner.load(Ordering::Acquire); + match NonZeroUsize::new(val) { + Some(it) => it, + None => self.init(f), + } + } + + #[cold] + #[inline(never)] + fn init(&self, f: impl FnOnce() -> NonZeroUsize) -> NonZeroUsize { + let mut val = f().get(); + let exchange = self + .inner + .compare_exchange(0, val, Ordering::AcqRel, Ordering::Acquire); + if let Err(old) = exchange { + val = old; + } + unsafe { NonZeroUsize::new_unchecked(val) } + } +} diff --git a/ring-0.17.14/src/polyfill/ptr.rs b/ring-0.17.14/src/polyfill/ptr.rs new file mode 100644 index 0000000000..58df760c99 --- /dev/null +++ b/ring-0.17.14/src/polyfill/ptr.rs @@ -0,0 +1,27 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// TODO(MSRV 1.76): Replace with `core::ptr::from_mut`. +#[allow(dead_code)] +#[inline(always)] +pub fn from_mut(r: &mut T) -> *mut T { + r +} + +// TODO(MSRV 1.76): Replace with `core::ptr::from_ref`. +#[allow(dead_code)] +#[inline(always)] +pub const fn from_ref(r: &T) -> *const T { + r +} diff --git a/ring-0.17.14/src/polyfill/slice.rs b/ring-0.17.14/src/polyfill/slice.rs new file mode 100644 index 0000000000..8299cc62d5 --- /dev/null +++ b/ring-0.17.14/src/polyfill/slice.rs @@ -0,0 +1,57 @@ +// Permission is hereby granted, free of charge, to any +// person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the +// Software without restriction, including without +// limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice +// shall be included in all copies or substantial portions +// of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +// SHALL THE AUTHOR OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +mod as_chunks; +mod as_chunks_mut; + +pub use as_chunks::{as_chunks, AsChunks}; +pub use as_chunks_mut::{as_chunks_mut, AsChunksMut}; + +// TODO(MSRV feature(split_at_checked)): Use `slice::split_at_checked`. +// +// Note that the libcore version is implemented in terms of +// `slice::split_at_unchecked()`, and `slice::split_at()` was changed to be +// implemented in terms of `split_at_checked`. 
For now, we implement this in +// terms of `split_at` and rely on the optimizer to eliminate the panic. +#[inline(always)] +pub fn split_at_checked(slice: &[T], i: usize) -> Option<(&[T], &[T])> { + if slice.len() >= i { + Some(slice.split_at(i)) + } else { + None + } +} + +// TODO(MSRV-1.77): Use `slice::split_first_chunk_mut`. +#[inline(always)] +pub fn split_first_chunk_mut( + slice: &mut [T], +) -> Option<(&mut [T; N], &mut [T])> { + if slice.len() >= N { + let (head, tail) = slice.split_at_mut(N); + head.try_into().ok().map(|head| (head, tail)) + } else { + None + } +} diff --git a/ring-0.17.14/src/polyfill/slice/as_chunks.rs b/ring-0.17.14/src/polyfill/slice/as_chunks.rs new file mode 100644 index 0000000000..bfe5ab5709 --- /dev/null +++ b/ring-0.17.14/src/polyfill/slice/as_chunks.rs @@ -0,0 +1,114 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::AsChunksMut; +use core::ops; + +#[inline(always)] +pub fn as_chunks(slice: &[T]) -> (AsChunks, &[T]) { + assert!(N != 0, "chunk size must be non-zero"); + let len = slice.len() / N; + let (multiple_of_n, remainder) = slice.split_at(len * N); + (AsChunks(multiple_of_n), remainder) +} + +#[derive(Clone, Copy)] +pub struct AsChunks<'a, T, const N: usize>(&'a [T]); + +impl<'a, T, const N: usize> AsChunks<'a, T, N> { + #[inline(always)] + pub fn from_ref(value: &'a [T; N]) -> Self { + Self(value) + } + + #[inline(always)] + pub fn as_flattened(&self) -> &[T] { + self.0 + } + + #[cfg(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64"))] + #[inline(always)] + pub fn as_ptr(&self) -> *const [T; N] { + self.0.as_ptr().cast() + } + + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.0.len() / N + } +} + +impl ops::Index for AsChunks<'_, T, N> +where + [T]: ops::Index, Output = [T]>, +{ + type Output = [T; N]; + + #[inline(always)] + fn index(&self, index: usize) -> &Self::Output { + let start = N * index; + let slice = &self.0[start..(start + N)]; + slice.try_into().unwrap() + } +} + +impl<'a, T, const N: usize> IntoIterator for AsChunks<'a, T, N> { + type IntoIter = AsChunksIter<'a, T, N>; + type Item = &'a [T; N]; + + #[inline(always)] + fn into_iter(self) -> Self::IntoIter { + AsChunksIter(self.0.chunks_exact(N)) + } +} + +pub struct AsChunksIter<'a, T, const N: usize>(core::slice::ChunksExact<'a, T>); + +impl<'a, T, const N: usize> Iterator for AsChunksIter<'a, T, N> { + type Item = &'a [T; N]; + + #[inline(always)] + fn next(&mut self) -> Option { + self.0.next().map(|x| x.try_into().unwrap()) + } +} + +// `&mut [[T; N]]` is implicitly convertable to `&[[T; N]]` but our types can't +// do that. 
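A sketch of how `as_chunks` is typically used, assuming its generic parameters are the element type and chunk size (`as_chunks::<T, N>`), as in upstream *ring*; the function below is hypothetical.

// Illustrative sketch (hypothetical helper): split into whole 4-byte chunks;
// `rest` holds the leftover tail bytes.
fn checksum(bytes: &[u8]) -> (u32, usize) {
    let (chunks, rest) = as_chunks::<u8, 4>(bytes);
    let sum = chunks
        .into_iter()
        .map(|chunk| u32::from_le_bytes(*chunk))
        .fold(0u32, u32::wrapping_add);
    (sum, rest.len())
}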
+impl<'a, T, const N: usize> From<&'a AsChunksMut<'_, T, N>> for AsChunks<'a, T, N> { + #[inline(always)] + fn from(as_mut: &'a AsChunksMut<'_, T, N>) -> Self { + Self(as_mut.as_flattened()) + } +} + +impl<'a, T, const N: usize> From<&'a [T; N]> for AsChunks<'a, T, N> { + #[inline(always)] + fn from(array: &'a [T; N]) -> Self { + Self(array) + } +} + +// TODO: `impl From for AsChunks<'a, T, N>`. +impl<'a, T> From> for AsChunks<'a, T, 4> { + #[inline(always)] + fn from(as_2x: AsChunks<'a, T, 8>) -> Self { + Self(as_2x.0) + } +} diff --git a/ring-0.17.14/src/polyfill/slice/as_chunks_mut.rs b/ring-0.17.14/src/polyfill/slice/as_chunks_mut.rs new file mode 100644 index 0000000000..ae32f04e8b --- /dev/null +++ b/ring-0.17.14/src/polyfill/slice/as_chunks_mut.rs @@ -0,0 +1,88 @@ +// Copyright 2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::AsChunks; + +#[inline(always)] +pub fn as_chunks_mut(slice: &mut [T]) -> (AsChunksMut, &mut [T]) { + assert!(N != 0, "chunk size must be non-zero"); + let len = slice.len() / N; + let (multiple_of_n, remainder) = slice.split_at_mut(len * N); + (AsChunksMut(multiple_of_n), remainder) +} + +pub struct AsChunksMut<'a, T, const N: usize>(&'a mut [T]); + +impl AsChunksMut<'_, T, N> { + #[inline(always)] + pub fn as_flattened(&self) -> &[T] { + self.0 + } + + #[inline(always)] + pub fn as_flattened_mut(&mut self) -> &mut [T] { + self.0 + } + + #[cfg(target_arch = "aarch64")] + pub fn as_ptr(&self) -> *const [T; N] { + self.0.as_ptr().cast() + } + + #[cfg(target_arch = "x86_64")] + pub fn as_ptr(&self) -> *const [T; N] { + self.0.as_ptr().cast() + } + + #[cfg(target_arch = "aarch64")] + pub fn as_mut_ptr(&mut self) -> *mut [T; N] { + self.0.as_mut_ptr().cast() + } + + #[cfg(target_arch = "x86_64")] + #[inline(always)] + pub fn as_mut(&mut self) -> AsChunksMut { + AsChunksMut(self.0) + } + + #[inline(always)] + pub fn as_ref(&self) -> AsChunks { + AsChunks::::from(self) + } + + // Argument moved from runtime argument to `const` argument so that + // `CHUNK_LEN * N` is checked at compile time for overflow. 
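    // For illustration (hypothetical usage): with N = 4 byte blocks, calling
    // `chunks_mut::<8>()` yields `AsChunksMut` windows covering 8 * 4 = 32
    // bytes each (the final window may be shorter); because the chunk count is
    // a const argument, the `8 * 4` multiplication is evaluated at compile
    // time, so an overflowing chunk size fails the build rather than panicking
    // at runtime.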
+ #[inline(always)] + pub fn chunks_mut(&mut self) -> AsChunksMutChunksMutIter { + AsChunksMutChunksMutIter(self.0.chunks_mut(CHUNK_LEN * N)) + } + + #[cfg(target_arch = "x86_64")] + #[inline(always)] + pub fn split_at_mut(&mut self, mid: usize) -> (AsChunksMut, AsChunksMut) { + let (before, after) = self.0.split_at_mut(mid * N); + (AsChunksMut(before), AsChunksMut(after)) + } +} + +pub struct AsChunksMutChunksMutIter<'a, T, const N: usize>(core::slice::ChunksMut<'a, T>); + +impl<'a, T, const N: usize> Iterator for AsChunksMutChunksMutIter<'a, T, N> { + type Item = AsChunksMut<'a, T, N>; + + #[inline(always)] + fn next(&mut self) -> Option { + self.0.next().map(AsChunksMut) + } +} diff --git a/ring-0.17.14/src/polyfill/sliceutil.rs b/ring-0.17.14/src/polyfill/sliceutil.rs new file mode 100644 index 0000000000..a6765f17b4 --- /dev/null +++ b/ring-0.17.14/src/polyfill/sliceutil.rs @@ -0,0 +1,23 @@ +// Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Utilities to make dealing with slices less tediuous. + +/// Replaces the first N elements of `a` with the first N elements of `b`, where +/// N is `core::cmp::min(a.len(), b.len())`, leaving the rest unchanged. +pub fn overwrite_at_start(a: &mut [T], b: &[T]) { + a.iter_mut().zip(b).for_each(|(a, b)| { + *a = *b; + }); +} diff --git a/ring-0.17.14/src/polyfill/test.rs b/ring-0.17.14/src/polyfill/test.rs new file mode 100644 index 0000000000..ea90a2437a --- /dev/null +++ b/ring-0.17.14/src/polyfill/test.rs @@ -0,0 +1,29 @@ +pub fn assert_iterator(it: impl ExactSizeIterator + Clone, expected: &[T]) +where + T: Copy + core::fmt::Debug + PartialEq, +{ + // Assert that the cloned iterator is correct. + assert_exact_size_iterator(it.clone(), expected); + // Assert that the original iterator is correct. + assert_exact_size_iterator(it, expected); +} + +/// Asserts that `it` adheres to the `ExactSizeIterator` contract. +fn assert_exact_size_iterator(mut it: impl ExactSizeIterator, expected: &[T]) +where + T: Copy + core::fmt::Debug + PartialEq, +{ + assert_eq!(it.len(), expected.len()); + assert_eq!(it.size_hint(), expected.iter().size_hint()); + + for i in 0..expected.len() { + let len = it.len(); + assert_eq!(len, expected.len() - i); + assert_eq!(it.size_hint(), (len, Some(len))); + assert_eq!(it.next(), Some(expected[i])); + } + + assert_eq!(it.len(), 0); + assert_eq!(it.size_hint(), (0, Some(0))); + assert_eq!(it.next(), None); +} diff --git a/ring-0.17.14/src/polyfill/unwrap_const.rs b/ring-0.17.14/src/polyfill/unwrap_const.rs new file mode 100644 index 0000000000..a1c982f97c --- /dev/null +++ b/ring-0.17.14/src/polyfill/unwrap_const.rs @@ -0,0 +1,29 @@ +// Copyright 2022 Brian Smith. 
+// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +/// Polyfill for `Option::unwrap()` as a const fn; feature `const_option`. +/// https://github.com/rust-lang/rust/issues/67441. +/// TODO(MSRV): Replace this with `x.unwrap()`. +/// +/// `T: Copy` avoids "constant functions cannot evaluate destructors." +pub const fn unwrap_const(x: Option) -> T +where + T: Copy, +{ + if let Some(x) = x { + x + } else { + panic!("unwrap_const on `None`"); + } +} diff --git a/ring-0.17.14/src/prefixed.rs b/ring-0.17.14/src/prefixed.rs new file mode 100644 index 0000000000..39f3f9fa06 --- /dev/null +++ b/ring-0.17.14/src/prefixed.rs @@ -0,0 +1,118 @@ +// Keep in sync with `core_name_and_version` in build.rs. +macro_rules! core_name_and_version { + () => { + concat!( + env!("CARGO_PKG_NAME"), + "_core_", + env!("CARGO_PKG_VERSION_MAJOR"), + "_", + env!("CARGO_PKG_VERSION_MINOR"), + "_", + env!("CARGO_PKG_VERSION_PATCH"), + "_", + env!("CARGO_PKG_VERSION_PRE"), // Often empty + ) + }; +} + +// Keep in sync with `prefix` in build.rs. +macro_rules! prefix { + ( ) => { + concat!(core_name_and_version!(), "_") + }; +} + +macro_rules! prefixed_extern { + // Functions. + { + $( + $( #[$meta:meta] )* + $vis:vis fn $name:ident ( $( $arg_pat:ident : $arg_ty:ty ),* $(,)? ) + $( -> $ret_ty:ty )?; + )+ + } => { + extern "C" { + $( + prefixed_item! { + link_name + $name + { + $( #[$meta] )* + $vis fn $name ( $( $arg_pat : $arg_ty ),* ) $( -> $ret_ty )?; + } + + } + )+ + } + }; + + // A `static` global variable. + { + $( #[$meta:meta] )* + $vis:vis static $name:ident: $typ:ty; + } => { + extern "C" { + prefixed_item! { + link_name + $name + { + $( #[$meta] )* + $vis static $name: $typ; + } + } + } + }; + + // A `static mut` global variable. + { + $( #[$meta:meta] )* + $vis:vis static mut $name:ident: $typ:ty; + } => { + extern "C" { + prefixed_item! { + link_name + $name + { + $( #[$meta] )* + $vis static mut $name: $typ; + } + } + } + }; +} + +#[deprecated = "`#[export_name]` creates problems and we will stop doing it."] +#[cfg(not(any( + all(target_arch = "aarch64", target_endian = "little"), + all(target_arch = "arm", target_endian = "little"), + target_arch = "x86", + target_arch = "x86_64" +)))] +macro_rules! prefixed_export { + // A function. + { + $( #[$meta:meta] )* + $vis:vis unsafe extern "C" + fn $name:ident ( $( $arg_pat:ident : $arg_ty:ty ),* $(,)? ) $body:block + } => { + prefixed_item! { + export_name + $name + { + $( #[$meta] )* + $vis unsafe extern "C" fn $name ( $( $arg_pat : $arg_ty ),* ) $body + } + } + }; +} + +macro_rules! 
prefixed_item { + { + $attr:ident + $name:ident + { $item:item } + } => { + #[$attr = concat!(prefix!(), stringify!($name))] + $item + }; +} diff --git a/ring-0.17.14/src/rand.rs b/ring-0.17.14/src/rand.rs new file mode 100644 index 0000000000..dc1c2a24e5 --- /dev/null +++ b/ring-0.17.14/src/rand.rs @@ -0,0 +1,176 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +//! Cryptographic pseudo-random number generation. +//! +//! *ring* functions that generate random bytes take a `&dyn SecureRandom` +//! parameter to make it clear which functions are non-deterministic. + +use crate::error; + +/// A secure random number generator. +pub trait SecureRandom: sealed::SecureRandom { + /// Fills `dest` with random bytes. + fn fill(&self, dest: &mut [u8]) -> Result<(), error::Unspecified>; +} + +impl SecureRandom for T +where + T: sealed::SecureRandom, +{ + #[inline(always)] + fn fill(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + self.fill_impl(dest) + } +} + +/// A random value constructed from a `SecureRandom` that hasn't been exposed +/// through any safe Rust interface. +/// +/// Intentionally does not implement any traits other than `Sized`. +pub struct Random(T); + +impl Random { + /// Expose the random value. + #[inline] + pub fn expose(self) -> T { + self.0 + } +} + +/// Generate the new random value using `rng`. +#[inline] +pub fn generate( + rng: &dyn SecureRandom, +) -> Result, error::Unspecified> { + let mut r = T::zero(); + rng.fill(r.as_mut_bytes())?; + Ok(Random(r)) +} + +pub(crate) mod sealed { + use crate::error; + + pub trait SecureRandom: core::fmt::Debug { + /// Fills `dest` with random bytes. + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified>; + } + + pub trait RandomlyConstructable: Sized { + fn zero() -> Self; // `Default::default()` + fn as_mut_bytes(&mut self) -> &mut [u8]; // `AsMut<[u8]>::as_mut` + } + + impl RandomlyConstructable for [u8; N] { + #[inline] + fn zero() -> Self { + [0; N] + } + + #[inline] + fn as_mut_bytes(&mut self) -> &mut [u8] { + &mut self[..] + } + } +} + +/// A type that can be returned by `ring::rand::generate()`. +pub trait RandomlyConstructable: sealed::RandomlyConstructable {} +impl RandomlyConstructable for T where T: sealed::RandomlyConstructable {} + +/// A secure random number generator where the random values come directly +/// from the operating system. +/// +/// "Directly from the operating system" here presently means "whatever the +/// `getrandom` crate does" but that may change in the future. That roughly +/// means calling libc's `getrandom` function or whatever is analogous to that; +/// see the `getrandom` crate's documentation for more info. +/// +/// A single `SystemRandom` may be shared across multiple threads safely. 
+/// +/// `new()` is guaranteed to always succeed and to have low latency; it won't +/// try to open or read from a file or do similar things. The first call to +/// `fill()` may block a substantial amount of time since any and all +/// initialization is deferred to it. Therefore, it may be a good idea to call +/// `fill()` once at a non-latency-sensitive time to minimize latency for +/// future calls. +#[derive(Clone, Debug)] +pub struct SystemRandom(()); + +impl SystemRandom { + /// Constructs a new `SystemRandom`. + #[inline(always)] + pub fn new() -> Self { + Self(()) + } +} + +impl crate::sealed::Sealed for SystemRandom {} + +// Use the `getrandom` crate whenever it is using the environment's (operating +// system's) CSPRNG. Avoid using it on targets where it uses the `rdrand` +// implementation. +#[cfg(all(not(target_os = "optee"), any( + all(feature = "less-safe-getrandom-custom-or-rdrand", target_os = "none"), + all(feature = "less-safe-getrandom-espidf", target_os = "espidf"), + target_os = "aix", + target_os = "android", + target_os = "dragonfly", + target_os = "freebsd", + target_os = "fuchsia", + target_os = "haiku", + target_os = "hermit", + target_os = "hurd", + target_os = "horizon", + target_os = "illumos", + target_os = "linux", + target_os = "netbsd", + target_os = "openbsd", + target_os = "redox", + target_os = "solaris", + target_os = "vita", + target_os = "windows", + all( + target_vendor = "apple", + any( + target_os = "ios", + target_os = "macos", + target_os = "tvos", + target_os = "visionos", + target_os = "watchos", + ) + ), + all( + target_arch = "wasm32", + any( + target_os = "wasi", + all(target_os = "unknown", feature = "wasm32_unknown_unknown_js") + ) + ), +)))] +impl sealed::SecureRandom for SystemRandom { + #[inline(always)] + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + getrandom::getrandom(dest).map_err(|_| error::Unspecified) + } +} + +#[cfg(target_os = "optee")] +impl sealed::SecureRandom for SystemRandom { + #[inline(always)] + fn fill_impl(&self, dest: &mut [u8]) -> Result<(), error::Unspecified> { + optee_utee::Random::generate(dest); + Ok(()) + } +} \ No newline at end of file diff --git a/ring-0.17.14/src/rsa.rs b/ring-0.17.14/src/rsa.rs new file mode 100644 index 0000000000..e1fb2e7fc6 --- /dev/null +++ b/ring-0.17.14/src/rsa.rs @@ -0,0 +1,76 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// *R* and *r* in Montgomery math refer to different things, so we always use +// `R` to refer to *R* to avoid confusion, even when that's against the normal +// naming conventions. Also the standard camelCase names are used for `KeyPair` +// components. + +//! RSA. 
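For orientation, a sketch of how this module is exercised through *ring*'s public `signature` API when verifying an RSA PKCS#1 v1.5 signature over a DER-encoded `RSAPublicKey`; the function name is hypothetical.

use ring::signature::{self, UnparsedPublicKey};

// Illustrative sketch (hypothetical helper): `RSA_PKCS1_2048_8192_SHA256` is
// an `RsaParameters` value bundling the padding scheme and the accepted
// modulus sizes.
fn verify_rsa_pkcs1_sha256(public_key_der: &[u8], msg: &[u8], sig: &[u8]) -> bool {
    UnparsedPublicKey::new(&signature::RSA_PKCS1_2048_8192_SHA256, public_key_der)
        .verify(msg, sig)
        .is_ok()
}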
+ +use crate::{ + arithmetic::bigint, + bits, error, + io::{self, der}, +}; + +pub(crate) mod padding; + +// Maximum RSA modulus size supported for signature verification (in bytes). +const PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN: usize = + bits::BitLength::from_bits(8192).as_usize_bytes_rounded_up(); + +// Keep in sync with the documentation comment for `KeyPair`. +const PRIVATE_KEY_PUBLIC_MODULUS_MAX_BITS: bits::BitLength = bits::BitLength::from_bits(4096); + +/// Parameters for RSA verification. +#[derive(Debug)] +pub struct RsaParameters { + padding_alg: &'static dyn padding::Verification, + min_bits: bits::BitLength, +} + +fn parse_public_key( + input: untrusted::Input, +) -> Result<(io::Positive, io::Positive), error::Unspecified> { + input.read_all(error::Unspecified, |input| { + der::nested(input, der::Tag::Sequence, error::Unspecified, |input| { + let n = der::positive_integer(input)?; + let e = der::positive_integer(input)?; + Ok((n, e)) + }) + }) +} + +// Type-level representation of an RSA public modulus *n*. See +// `super::bigint`'s modulue-level documentation. +enum N {} + +impl bigint::PublicModulus for N {} + +mod keypair; +mod keypair_components; +mod public_exponent; +mod public_key; +mod public_key_components; +mod public_modulus; + +pub(crate) mod verification; + +use self::{public_exponent::PublicExponent, public_modulus::PublicModulus}; + +pub use self::{ + keypair::KeyPair, keypair_components::KeyPairComponents, public_key::PublicKey, + public_key_components::PublicKeyComponents, +}; diff --git a/ring-0.17.14/src/rsa/keypair.rs b/ring-0.17.14/src/rsa/keypair.rs new file mode 100644 index 0000000000..5994e55882 --- /dev/null +++ b/ring-0.17.14/src/rsa/keypair.rs @@ -0,0 +1,685 @@ +// Copyright 2015-2016 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use super::{ + padding::{self, RsaEncoding}, + KeyPairComponents, PublicExponent, PublicKey, PublicKeyComponents, N, +}; + +/// RSA PKCS#1 1.5 signatures. +use crate::{ + arithmetic::{ + bigint, + montgomery::{R, RR, RRR}, + LimbSliceError, + }, + bits::BitLength, + cpu, digest, + error::{self, KeyRejected}, + io::der, + pkcs8, rand, signature, +}; + +/// An RSA key pair, used for signing. +pub struct KeyPair { + p: PrivateCrtPrime